xref: /spdk/scripts/setup.sh (revision 2f557958d0762ad00e068f997e2d25a205ded4b7)
#!/usr/bin/env bash

# Abort on the first unhandled command failure.
set -e

# Repository root: the parent of the directory this script lives in. The
# expansions are quoted so a checkout path containing whitespace still works.
rootdir=$(readlink -f "$(dirname "$0")")/..
# Shared helpers. NOTE(review): iter_pci_class_code and iter_pci_dev_id are used
# below but not defined in this file - presumably they come from common.sh.
source "$rootdir/scripts/common.sh"
7
# Print the help text and terminate the script.
# Arguments:
#   $1 - path of the invoked script (only its basename is printed)
#   $2 - optional error message, printed before the help text
# Exit status: 0 for a plain help request, 1 when an error message was given
# (i.e. the script was invoked with an invalid argument).
function usage()
{
	local options os script_name

	os=$(uname)
	script_name=$(basename -- "$1")

	# status and cleanup are only implemented for Linux.
	if [[ $os == Linux ]]; then
		options="[config|reset|status|cleanup|help]"
	else
		options="[config|reset|help]"
	fi

	if [[ -n $2 ]]; then
		echo "$2"
		echo ""
	fi
	echo "Helper script for allocating hugepages and binding NVMe, I/OAT and Virtio devices to"
	echo "a generic VFIO kernel driver. If VFIO is not available on the system, this script will"
	echo "fall back to UIO. NVMe and Virtio devices with active mountpoints will be ignored."
	echo "All hugepage operations use default hugepage size on the system (hugepagesz)."
	echo "Usage: $script_name $options"
	echo
	echo "$options - as following:"
	echo "config            Default mode. Allocate hugepages and bind PCI devices."
	if [[ $os == Linux ]]; then
		echo "cleanup            Remove any orphaned files that can be left in the system after SPDK application exit"
	fi
	echo "reset             Rebind PCI devices back to their original drivers."
	echo "                  Also cleanup any leftover spdk files/resources."
	echo "                  Hugepage memory size will remain unchanged."
	if [[ $os == Linux ]]; then
		echo "status            Print status of all SPDK-compatible devices on the system."
	fi
	echo "help              Print this help message."
	echo
	echo "The following environment variables can be specified."
	echo "HUGEMEM           Size of hugepage memory to allocate (in MB). 2048 by default."
	echo "                  For NUMA systems, the hugepages will be evenly distributed"
	echo "                  between CPU nodes"
	echo "NRHUGE            Number of hugepages to allocate. This variable overwrites HUGEMEM."
	echo "HUGENODE          Specific NUMA node to allocate hugepages on. To allocate"
	echo "                  hugepages on multiple nodes run this script multiple times -"
	echo "                  once for each node."
	echo "PCI_WHITELIST     Whitespace separated list of PCI devices (NVMe, I/OAT, Virtio) to bind."
	echo "                  Each device must be specified as a full PCI address."
	echo "                  E.g. PCI_WHITELIST=\"0000:01:00.0 0000:02:00.0\""
	echo "                  To blacklist all PCI devices use a non-valid address."
	echo "                  E.g. PCI_WHITELIST=\"none\""
	echo "                  If empty or unset, all PCI devices will be bound."
	echo "TARGET_USER       User that will own hugepage mountpoint directory and vfio groups."
	echo "                  By default the current user will be used."
	echo "DRIVER_OVERRIDE   Disable automatic vfio-pci/uio_pci_generic selection and forcefully"
	echo "                  bind devices to the given driver."
	echo "                  E.g. DRIVER_OVERRIDE=uio_pci_generic or DRIVER_OVERRIDE=vfio-pci"

	# An error message means the help was triggered by bad input: report failure.
	if [[ -n $2 ]]; then
		exit 1
	fi
	exit 0
}
57
# In monolithic kernels lsmod won't list the driver, so the lsmod lookup is
# backed by a /sys/module check. A different code is returned for built-in vs
# module just in case we want that down the road.
# Arguments:
#   $1 - kernel driver name; '-' and '_' are treated as interchangeable
# Returns (NOTE: not the usual shell convention - callers inspect $?):
#   1 - the driver appears as a module in the lsmod listing
#   2 - not listed by lsmod, but a /sys/module/<name> directory exists
#   0 - the driver was not found at all
function check_for_driver {
	local module=${1//-/_}

	# Compare the whole module-name column so that e.g. "nvme" is not
	# mistaken for "nvme_core".
	if lsmod | awk -v name="$module" '$1 == name { found = 1 } END { exit !found }'; then
		return 1
	fi

	if [[ -d /sys/module/$1 || -d /sys/module/$module ]]; then
		return 2
	fi

	return 0
}
74
# Check a PCI address against the PCI_WHITELIST array.
# Arguments:
#   $1 - PCI address to look up (any further arguments are ignored)
# Returns (NOTE: inverted relative to the usual shell convention):
#   1 - the device may be bound: the whitelist is empty or contains $1
#   0 - the device is not whitelisted
# Callers therefore write `if pci_can_bind <bdf>; then <skip the device>`.
function pci_can_bind() {
	local allowed

	if [[ ${#PCI_WHITELIST[@]} == 0 ]]; then
		# no whitelist specified, bind all devices
		return 1
	fi

	for allowed in "${PCI_WHITELIST[@]}"; do
		if [[ $allowed == "$1" ]]; then
			return 1
		fi
	done
	return 0
}
89
# Bind a PCI device to the given kernel driver through sysfs.
# Arguments:
#   $1 - PCI address (BDF) of the device
#   $2 - name of the driver to bind the device to
# Globals:
#   TARGET_USER (read) - if non-empty, becomes the owner of the device's
#                        /dev/vfio/<iommu group> node when that node exists
# Prints one "<bdf> (<vendor> <device>): <old driver> -> <new driver>" line.
# Returns 0 without doing anything if the device already uses the driver.
function linux_bind_driver() {
	# All working variables are local: callers use the same names (bdf,
	# driver_name) and configure_linux reads the global driver_name afterwards.
	local bdf="$1"
	local driver_name="$2"
	local old_driver_name="no driver"
	local sysfs_dev="/sys/bus/pci/devices/$1"
	local ven_dev_id iommu_group

	# "<vendor> <device>" as expected by the new_id/remove_id sysfs files.
	ven_dev_id=$(lspci -n -s "$bdf" | cut -d' ' -f3 | sed 's/:/ /')

	if [ -e "$sysfs_dev/driver" ]; then
		old_driver_name=$(basename "$(readlink "$sysfs_dev/driver")")

		if [ "$driver_name" = "$old_driver_name" ]; then
			return 0
		fi

		# Detach from the current driver; remove_id is best effort.
		echo "$ven_dev_id" > "$sysfs_dev/driver/remove_id" 2> /dev/null || true
		echo "$bdf" > "$sysfs_dev/driver/unbind"
	fi

	echo "$bdf ($ven_dev_id): $old_driver_name -> $driver_name"

	# Both writes are best effort; failures are deliberately ignored.
	echo "$ven_dev_id" > "/sys/bus/pci/drivers/$driver_name/new_id" 2> /dev/null || true
	echo "$bdf" > "/sys/bus/pci/drivers/$driver_name/bind" 2> /dev/null || true

	iommu_group=$(basename "$(readlink -f "$sysfs_dev/iommu_group")")
	if [ -e "/dev/vfio/$iommu_group" ] && [ -n "$TARGET_USER" ]; then
		chown "$TARGET_USER" "/dev/vfio/$iommu_group"
	fi
}
119
# Detach a PCI device from whatever kernel driver it is currently bound to.
# Arguments:
#   $1 - PCI address (BDF) of the device
# Prints "<bdf> (<vendor> <device>): <old driver> -> no driver" after unbinding.
# Returns 0 silently when the device has no driver.
function linux_unbind_driver() {
	local bdf="$1"
	local sysfs_driver="/sys/bus/pci/devices/$1/driver"
	local ven_dev_id old_driver_name

	if ! [ -e "$sysfs_driver" ]; then
		return 0
	fi

	# "<vendor> <device>" as expected by the remove_id sysfs file.
	ven_dev_id=$(lspci -n -s "$bdf" | cut -d' ' -f3 | sed 's/:/ /')
	old_driver_name=$(basename "$(readlink "$sysfs_driver")")

	# remove_id is best effort; the unbind itself must succeed.
	echo "$ven_dev_id" > "$sysfs_driver/remove_id" 2> /dev/null || true
	echo "$bdf" > "$sysfs_driver/unbind"
	echo "$bdf ($ven_dev_id): $old_driver_name -> no driver"
}
134
# Print the mount point of every mounted hugetlbfs filesystem, one per line.
function linux_hugetlbfs_mounts() {
	# Select the `mount` lines containing " type hugetlbfs " and print their
	# third whitespace-separated field (the mount point).
	mount | awk '/ type hugetlbfs / { print $3 }'
}
138
# Look up the NVMe block device that belongs to a PCI address.
# Arguments:
#   $1 - PCI address (BDF) to look for
#   $2 - name of the caller's variable that receives the block device name
#        (e.g. nvme0n1); the variable is left untouched when nothing matches
function get_nvme_name_from_bdf {
	local _nvme_devs _dev _link_name _link_bdf

	# grep exits non-zero when there is no NVMe block device; that is not an
	# error. Guarding with `|| true` avoids toggling the caller's `set -e` state.
	_nvme_devs=$(lsblk -d --output NAME | grep "^nvme") || true

	for _dev in $_nvme_devs; do
		# Try .../device/device first and fall back to .../device; the
		# basename of whichever link resolves is compared against the BDF.
		_link_name=$(readlink "/sys/block/$_dev/device/device") || true
		if [ -z "$_link_name" ]; then
			_link_name=$(readlink "/sys/block/$_dev/device") || true
		fi
		_link_bdf=$(basename "$_link_name")
		if [ "$_link_bdf" = "$1" ]; then
			# Assign to the caller's variable without going through eval.
			printf -v "$2" '%s' "$_dev"
			return
		fi
	done
}
155
# Collect the block devices whose /sys/block/<dev> link target contains a PCI
# address.
# Arguments:
#   $1 - PCI address (BDF) to look for
#   $2 - name of the caller's variable that receives the result: each matching
#        device name preceded by a single space, or an empty string
function get_virtio_names_from_bdf {
	local _blk_devs _dev _names=''

	_blk_devs=$(lsblk --nodeps --output NAME)

	for _dev in $_blk_devs; do
		# -F: match the address literally; its dots are not regex wildcards.
		if readlink "/sys/block/$_dev" | grep -qF -- "$1"; then
			_names="$_names $_dev"
		fi
	done

	# Assign to the caller's variable without going through eval.
	printf -v "$2" '%s' "$_names"
}
168
# Bind all eligible NVMe, I/OAT and Virtio PCI devices to a userspace driver.
# Globals:
#   DRIVER_OVERRIDE (read)  - if non-empty, used as the driver instead of the
#                             automatic vfio-pci / uio_pci_generic choice
#   rootdir (read)          - used to locate include/spdk/pci_ids.h
#   driver_name (written)   - the selected driver; intentionally left global
#                             because configure_linux reads it afterwards
# Devices rejected by pci_can_bind and devices whose block devices have active
# mountpoints are skipped with a message.
function configure_linux_pci {
	local bdf dev_id blkname blknames mountpoints
	local pci_ids="$rootdir/include/spdk/pci_ids.h"

	if [ -z "${DRIVER_OVERRIDE}" ]; then
		driver_name=vfio-pci
		if ! compgen -G "/sys/kernel/iommu_groups/*" > /dev/null; then
			# No IOMMU. Use uio.
			driver_name=uio_pci_generic
		fi
	else
		driver_name="${DRIVER_OVERRIDE}"
	fi

	modprobe "$driver_name"

	# NVMe
	for bdf in $(iter_pci_class_code 01 08 02); do
		blkname=''
		get_nvme_name_from_bdf "$bdf" blkname
		# pci_can_bind succeeds (status 0) when the device is NOT whitelisted.
		if pci_can_bind "$bdf"; then
			echo "Skipping un-whitelisted NVMe controller $blkname ($bdf)"
			continue
		fi
		if [ "$blkname" != "" ]; then
			mountpoints=$(lsblk "/dev/$blkname" --output MOUNTPOINT -n | wc -w)
		else
			mountpoints="0"
		fi
		if [ "$mountpoints" = "0" ]; then
			linux_bind_driver "$bdf" "$driver_name"
		else
			echo "Active mountpoints on /dev/$blkname, so not binding PCI dev $bdf"
		fi
	done

	# IOAT
	# The device IDs are the text following the "x" on each
	# PCI_DEVICE_ID_INTEL_IOAT line of pci_ids.h.
	for dev_id in $(awk -F"x" '/PCI_DEVICE_ID_INTEL_IOAT/ {print $2}' "$pci_ids"); do
		for bdf in $(iter_pci_dev_id 8086 "$dev_id"); do
			if pci_can_bind "$bdf"; then
				echo "Skipping un-whitelisted I/OAT device at $bdf"
				continue
			fi

			linux_bind_driver "$bdf" "$driver_name"
		done
	done

	# virtio
	# Same extraction as above, for the PCI_DEVICE_ID_VIRTIO lines.
	for dev_id in $(awk -F"x" '/PCI_DEVICE_ID_VIRTIO/ {print $2}' "$pci_ids"); do
		for bdf in $(iter_pci_dev_id 1af4 "$dev_id"); do
			if pci_can_bind "$bdf"; then
				echo "Skipping un-whitelisted Virtio device at $bdf"
				continue
			fi
			blknames=''
			get_virtio_names_from_bdf "$bdf" blknames
			for blkname in $blknames; do
				if mount | grep -q "/dev/$blkname"; then
					echo "Active mountpoints on /dev/$blkname, so not binding PCI dev $bdf"
					# Skip this PCI device entirely, not just this block device.
					continue 2
				fi
			done

			linux_bind_driver "$bdf" "$driver_name"
		done
	done

	echo "1" > "/sys/bus/pci/rescan"
}
247
# Remove orphaned SPDK/DPDK runtime files that no running process has open.
# Candidates:
#   - {/var/run,/tmp,$XDG_RUNTIME_DIR}/dpdk/spdk<N> and .../spdk_pid<N>
#     directories together with the files directly inside them
#   - /dev/shm entries whose name matches one of the SPDK trace patterns
# Every candidate is compared against the files currently open by any process
# (resolved from /proc/<pid>/fd); open ones are reported and kept.
# Prints "Clean" when finished or when there was nothing to do. Exits the
# script with status 1 if no list of open files could be obtained.
# NOTE: rmdir fails (and aborts the script under `set -e`) for a directory that
# still contains kept files.
function cleanup_linux {
	local -a dirs_to_clean=() files_to_clean=() candidates=()
	local path resolved fd_dir opened_files=""
	local rundir_re='/spdk(_pid)?[0-9]+$'
	local trace_re='(spdk_tgt|iscsi|vhost|nvmf|rocksdb|bdevtest|bdevperf)_trace|spdk_iscsi_conns'
	local restore_nullglob=0

	# Enable nullglob only for the candidate scan and restore the caller's
	# setting afterwards instead of unconditionally switching it off.
	shopt -q nullglob || restore_nullglob=1
	shopt -s nullglob

	for path in {/var/run,/tmp}/dpdk/spdk{,_pid}[0-9]*; do
		# The glob is slightly too wide; the regex insists on digits only.
		if [[ $path =~ $rundir_re ]]; then
			dirs_to_clean+=("$path")
		fi
	done
	if [[ -d $XDG_RUNTIME_DIR ]]; then
		for path in "$XDG_RUNTIME_DIR"/dpdk/spdk{,_pid}[0-9]*; do
			if [[ $path =~ $rundir_re ]] && resolved=$(readlink -e -- "$path"); then
				dirs_to_clean+=("$resolved")
			fi
		done
	fi

	for path in "${dirs_to_clean[@]}"; do
		candidates+=("$path"/*)
	done
	# Filter /dev/shm entry by entry so that one matching name cannot drag
	# every other file in /dev/shm into the removal list.
	for path in /dev/shm/*; do
		if [[ $path =~ $trace_re ]]; then
			candidates+=("$path")
		fi
	done

	if ((restore_nullglob)); then
		shopt -u nullglob
	fi

	# Canonicalize so the paths are comparable with the resolved fd links below.
	for path in "${candidates[@]}"; do
		if resolved=$(readlink -e -- "$path"); then
			files_to_clean+=("$resolved")
		fi
	done

	if ((${#files_to_clean[@]} == 0)); then
		echo "Clean"
		return 0
	fi

	for fd_dir in /proc/[0-9]*/fd; do
		# Command substitution strips the trailing newline; add one back so the
		# last path of this process is not fused with the first of the next.
		opened_files+="$(readlink -e -- "$fd_dir"/* 2> /dev/null || true)"$'\n'
	done

	if [[ -z ${opened_files//$'\n'/} ]]; then
		echo "Can't get list of opened files!"
		exit 1
	fi

	echo 'Cleaning'
	for path in "${files_to_clean[@]}"; do
		# -F -x: literal whole-line comparison; the path is not a regex.
		if ! grep -qFx -- "$path" <<< "$opened_files"; then
			echo "Removing:    $path"
			rm -- "$path"
		else
			echo "Still open: $path"
		fi
	done

	for path in "${dirs_to_clean[@]}"; do
		if ! grep -qFx -- "$path" <<< "$opened_files"; then
			echo "Removing:    $path"
			rmdir -- "$path"
		else
			echo "Still open: $path"
		fi
	done
	echo "Clean"
}
302
# Linux "config" mode: bind the PCI devices, make sure hugetlbfs is mounted
# and allocate the requested number of hugepages.
# Globals (read):
#   NRHUGE      - number of hugepages to allocate
#   HUGENODE    - optional NUMA node to allocate on (default: system wide)
#   HUGEPGSZ    - hugepage size in kB, used for the per-node sysfs path
#   TARGET_USER - optional owner for the hugetlbfs mount points
#   driver_name - set by configure_linux_pci; the ownership and memlock steps
#                 only run for vfio-pci
# Exits the script with status 1 if fewer hugepages than requested were
# allocated.
function configure_linux {
	local hugetlbfs_mounts hugepages_target allocated_hugepages mount
	local memlock_amnt memlock_mb

	configure_linux_pci
	hugetlbfs_mounts=$(linux_hugetlbfs_mounts)

	if [ -z "$hugetlbfs_mounts" ]; then
		hugetlbfs_mounts=/mnt/huge
		echo "Mounting hugetlbfs at $hugetlbfs_mounts"
		mkdir -p "$hugetlbfs_mounts"
		mount -t hugetlbfs nodev "$hugetlbfs_mounts"
	fi

	if [ -z "$HUGENODE" ]; then
		hugepages_target="/proc/sys/vm/nr_hugepages"
	else
		hugepages_target="/sys/devices/system/node/node${HUGENODE}/hugepages/hugepages-${HUGEPGSZ}kB/nr_hugepages"
	fi

	# Request the pages, then read the value back to see how many were granted.
	echo "$NRHUGE" > "$hugepages_target"
	allocated_hugepages=$(cat "$hugepages_target")
	if [ "$allocated_hugepages" -lt "$NRHUGE" ]; then
		echo ""
		echo "## ERROR: requested $NRHUGE hugepages but only $allocated_hugepages could be allocated."
		echo "## Memory might be heavily fragmented. Please try flushing the system cache, or reboot the machine."
		exit 1
	fi

	if [ "$driver_name" = "vfio-pci" ]; then
		if [ -n "$TARGET_USER" ]; then
			# Word splitting is intentional: one mount point per word.
			for mount in $hugetlbfs_mounts; do
				chown "$TARGET_USER" "$mount"
				chmod g+w "$mount"
			done
		fi

		memlock_amnt=$(ulimit -l)
		if [ "$memlock_amnt" != "unlimited" ]; then
			memlock_mb=$((memlock_amnt / 1024))
			echo ""
			echo "Current user memlock limit: ${memlock_mb} MB"
			echo ""
			echo "This is the maximum amount of memory you will be"
			echo "able to use with DPDK and VFIO if run as current user."
			echo -n "To change this, please adjust limits.conf memlock "
			echo "limit for current user."

			if [ "$memlock_amnt" -lt 65536 ]; then
				echo ""
				echo "## WARNING: memlock limit is less than 64MB"
				echo -n "## DPDK with VFIO may not be able to initialize "
				echo "if run as current user."
			fi
		fi
	fi
}
357
# Hand NVMe, I/OAT and Virtio PCI devices back to their kernel drivers.
# NVMe and I/OAT devices are rebound to nvme / ioatdma when check_for_driver
# reports that driver as present, and are merely unbound otherwise. Virtio
# devices are always rebound to virtio-pci. Devices rejected by pci_can_bind
# are skipped with a message.
# Globals:
#   rootdir (read) - used to locate include/spdk/pci_ids.h
function reset_linux_pci {
	local bdf dev_id blkname driver_loaded
	local pci_ids="$rootdir/include/spdk/pci_ids.h"

	# NVMe
	# check_for_driver returns non-zero when the driver IS present; `||`
	# captures that code without tripping `set -e`.
	driver_loaded=0
	check_for_driver nvme || driver_loaded=$?
	for bdf in $(iter_pci_class_code 01 08 02); do
		# pci_can_bind succeeds (status 0) when the device is NOT whitelisted.
		if pci_can_bind "$bdf"; then
			# Resolve the block device name for the message; it stays empty
			# when the controller is not bound to the kernel nvme driver.
			blkname=''
			get_nvme_name_from_bdf "$bdf" blkname
			echo "Skipping un-whitelisted NVMe controller $blkname ($bdf)"
			continue
		fi
		if [ "$driver_loaded" -ne 0 ]; then
			linux_bind_driver "$bdf" nvme
		else
			linux_unbind_driver "$bdf"
		fi
	done

	# IOAT
	driver_loaded=0
	check_for_driver ioatdma || driver_loaded=$?
	# The device IDs are the text following the "x" on each
	# PCI_DEVICE_ID_INTEL_IOAT line of pci_ids.h.
	for dev_id in $(awk -F"x" '/PCI_DEVICE_ID_INTEL_IOAT/ {print $2}' "$pci_ids"); do
		for bdf in $(iter_pci_dev_id 8086 "$dev_id"); do
			if pci_can_bind "$bdf"; then
				echo "Skipping un-whitelisted I/OAT device at $bdf"
				continue
			fi
			if [ "$driver_loaded" -ne 0 ]; then
				linux_bind_driver "$bdf" ioatdma
			else
				linux_unbind_driver "$bdf"
			fi
		done
	done

	# virtio
	# TODO: check if virtio-pci is loaded first and just unbind if it is not loaded
	# Requires some more investigation - for example, some kernels do not seem to have
	#  virtio-pci but just virtio_scsi instead.  Also need to make sure we get the
	#  underscore vs. dash right in the virtio_scsi name.
	modprobe virtio-pci || true
	for dev_id in $(awk -F"x" '/PCI_DEVICE_ID_VIRTIO/ {print $2}' "$pci_ids"); do
		for bdf in $(iter_pci_dev_id 1af4 "$dev_id"); do
			if pci_can_bind "$bdf"; then
				echo "Skipping un-whitelisted Virtio device at $bdf"
				continue
			fi
			linux_bind_driver "$bdf" virtio-pci
		done
	done

	echo "1" > "/sys/bus/pci/rescan"
}
425
# Linux "reset" mode: rebind the PCI devices, then delete the spdk*map_* files
# from every hugetlbfs mount point and the /run/.spdk* files.
function reset_linux {
	# Keep the loop variable out of the global namespace.
	local mount

	reset_linux_pci
	for mount in $(linux_hugetlbfs_mounts); do
		rm -f "$mount"/spdk*map_*
	done
	rm -f /run/.spdk*
}
433
# Linux "status" mode: print the hugepage counters followed by one table each
# for the NVMe, I/OAT and Virtio PCI devices (address, NUMA node, bound driver
# and, where applicable, device names).
# Globals (read):
#   rootdir  - used to locate include/spdk/pci_ids.h
#   HUGEPGSZ - shown as the page size in the system-wide fallback row
function status_linux {
	local path node huge_size free_pages all_pages numa_nodes=0
	local bdf dev_id driver name blknames ioat_ids virtio_ids
	local pci_ids="$rootdir/include/spdk/pci_ids.h"
	local restore_nullglob=0

	echo "Hugepages"
	printf "%-6s %10s %8s / %6s\n" "node" "hugesize" "free" "total"

	# Enable nullglob for the scan only and restore the caller's setting.
	shopt -q nullglob || restore_nullglob=1
	shopt -s nullglob
	# node[0-9]* (rather than node?) also covers NUMA nodes numbered 10 and up.
	for path in /sys/devices/system/node/node[0-9]*/hugepages/hugepages-*/; do
		[[ $path =~ (node[0-9]+)/hugepages/hugepages-([0-9]+kB) ]] || continue
		node=${BASH_REMATCH[1]}
		huge_size=${BASH_REMATCH[2]}

		numa_nodes=$((numa_nodes + 1))
		free_pages=$(cat "${path}free_hugepages")
		all_pages=$(cat "${path}nr_hugepages")

		printf "%-6s %10s %8s / %6s\n" "$node" "$huge_size" "$free_pages" "$all_pages"
	done
	if ((restore_nullglob)); then
		shopt -u nullglob
	fi

	# fall back to system-wide hugepages
	if [ "$numa_nodes" = "0" ]; then
		free_pages=$(grep HugePages_Free /proc/meminfo | awk '{ print $2 }')
		all_pages=$(grep HugePages_Total /proc/meminfo | awk '{ print $2 }')
		node="-"
		huge_size="$HUGEPGSZ"

		printf "%-6s %10s %8s / %6s\n" "$node" "$huge_size" "$free_pages" "$all_pages"
	fi

	echo "NVMe devices"

	echo -e "BDF\t\tNuma Node\tDriver name\t\tDevice name"
	for bdf in $(iter_pci_class_code 01 08 02); do
		driver=$(grep DRIVER "/sys/bus/pci/devices/$bdf/uevent" | awk -F"=" '{print $2}')
		node=$(cat "/sys/bus/pci/devices/$bdf/numa_node")
		if [[ $driver == "nvme" && -d /sys/bus/pci/devices/$bdf/nvme ]]; then
			name="\t"$(ls "/sys/bus/pci/devices/$bdf/nvme")
		else
			name="-"
		fi
		echo -e "$bdf\t$node\t\t$driver\t\t$name"
	done

	echo "I/OAT DMA"

	# The device IDs are the text following the "x" on each
	# PCI_DEVICE_ID_INTEL_IOAT line of pci_ids.h.
	ioat_ids=$(grep "PCI_DEVICE_ID_INTEL_IOAT" "$pci_ids" | awk -F"x" '{print $2}')
	echo -e "BDF\t\tNuma Node\tDriver Name"
	for dev_id in $ioat_ids; do
		for bdf in $(iter_pci_dev_id 8086 "$dev_id"); do
			driver=$(grep DRIVER "/sys/bus/pci/devices/$bdf/uevent" | awk -F"=" '{print $2}')
			node=$(cat "/sys/bus/pci/devices/$bdf/numa_node")
			echo -e "$bdf\t$node\t\t$driver"
		done
	done

	echo "virtio"

	# Same extraction as above, for the PCI_DEVICE_ID_VIRTIO lines.
	virtio_ids=$(grep "PCI_DEVICE_ID_VIRTIO" "$pci_ids" | awk -F"x" '{print $2}')
	echo -e "BDF\t\tNuma Node\tDriver Name\t\tDevice Name"
	for dev_id in $virtio_ids; do
		for bdf in $(iter_pci_dev_id 1af4 "$dev_id"); do
			driver=$(grep DRIVER "/sys/bus/pci/devices/$bdf/uevent" | awk -F"=" '{print $2}')
			node=$(cat "/sys/bus/pci/devices/$bdf/numa_node")
			blknames=''
			get_virtio_names_from_bdf "$bdf" blknames
			echo -e "$bdf\t$node\t\t$driver\t\t$blknames"
		done
	done
}
508
# FreeBSD: hand all NVMe and I/OAT devices to the nic_uio kernel module.
# Builds a comma separated "bus:slot:function" list from `pciconf -l`, stores
# it in the hw.nic_uio.bdfs kernel environment variable and (re)loads nic_uio.
# Globals:
#   rootdir (read) - used to locate include/spdk/pci_ids.h
function configure_freebsd_pci {
	local grep_str bdfs dev_id
	local pci_ids="$rootdir/include/spdk/pci_ids.h"

	# NVMe
	grep_str="class=0x010802"

	# IOAT
	# One "chip=0x<device id>8086" alternative per PCI_DEVICE_ID_INTEL_IOAT
	# line; the device ID is the text following the "x" on that line.
	for dev_id in $(awk -F"x" '/PCI_DEVICE_ID_INTEL_IOAT/ {print $2}' "$pci_ids"); do
		grep_str="${grep_str}\|chip=0x${dev_id}8086"
	done

	# Fields 2-4 of a ':'-separated pciconf line are joined as "a:b:c" and the
	# entries are separated by commas.
	bdfs=$(pciconf -l | grep "${grep_str}" \
		| awk -F: '{if (count > 0) printf ","; printf "%s:%s:%s",$2,$3,$4; count++}')

	# Unloading fails harmlessly when the module is not loaded.
	kldunload nic_uio.ko || true
	kenv hw.nic_uio.bdfs="$bdfs"
	kldload nic_uio.ko
}
532
# FreeBSD "config" mode: bind the PCI devices and load the contigmem module
# with HUGEMEM / 256 buffers of 256 MiB each.
# Globals:
#   HUGEMEM (read) - amount of memory to reserve, in MB
function configure_freebsd {
	local wanted_buffers=$((HUGEMEM / 256))
	local current_buffers

	configure_freebsd_pci
	# If contigmem is already loaded but the HUGEMEM specified doesn't match the
	#  previous value, unload contigmem so that we can reload with the new value.
	if kldstat -q -m contigmem; then
		current_buffers=$(kenv hw.contigmem.num_buffers 2> /dev/null) || current_buffers=""
		# A missing or non-numeric value counts as a mismatch instead of
		# breaking the numeric comparison and keeping the stale configuration.
		if ! [[ $current_buffers =~ ^[0-9]+$ ]] || ((10#$current_buffers != wanted_buffers)); then
			kldunload contigmem.ko
		fi
	fi
	if ! kldstat -q -m contigmem; then
		kenv hw.contigmem.num_buffers="$wanted_buffers"
		kenv hw.contigmem.buffer_size=$((256 * 1024 * 1024))
		kldload contigmem.ko
	fi
}
548
# FreeBSD "reset" mode: unload the contigmem and nic_uio kernel modules.
# A failing unload is ignored.
function reset_freebsd {
	local module

	for module in contigmem.ko nic_uio.ko; do
		kldunload "$module" || true
	done
}
553
# ---------------------------------------------------------------------------
# Entry point: the first argument selects the mode, "config" being the default.
# ---------------------------------------------------------------------------
mode=$1

if [ -z "$mode" ]; then
	mode="config"
fi

: "${HUGEMEM:=2048}"
: "${PCI_WHITELIST:=}"

# NVME_WHITELIST entries are appended to PCI_WHITELIST.
if [ -n "$NVME_WHITELIST" ]; then
	PCI_WHITELIST="$PCI_WHITELIST $NVME_WHITELIST"
fi

# SKIP_PCI replaces the whitelist with an address no device can match.
if [ -n "$SKIP_PCI" ]; then
	PCI_WHITELIST="none"
fi

# Word splitting is intentional: the whitespace separated list becomes an array.
declare -a PCI_WHITELIST=(${PCI_WHITELIST})

# Fall back to the sudo invoker and then to the login name; logname may fail
# (e.g. without a controlling terminal), which is tolerated.
if [ -z "$TARGET_USER" ]; then
	TARGET_USER="$SUDO_USER"
	if [ -z "$TARGET_USER" ]; then
		TARGET_USER=$(logname 2> /dev/null) || true
	fi
fi

if [ "$(uname)" = Linux ]; then
	# Default hugepage size in kB; evaluates to 0 when /proc/meminfo has no
	# Hugepagesize line.
	HUGEPGSZ=$(($(grep Hugepagesize /proc/meminfo | cut -d : -f 2 | tr -dc '0-9')))
	HUGEPGSZ_MB=$((HUGEPGSZ / 1024))
	# Round HUGEMEM (MB) up to whole hugepages. Working in kB gives the same
	# result as the MB based formula for page sizes that are whole megabytes,
	# and cannot divide by zero for page sizes below 1 MB.
	if ((HUGEPGSZ > 0)); then
		: "${NRHUGE=$(((HUGEMEM * 1024 + HUGEPGSZ - 1) / HUGEPGSZ))}"
	fi

	case "$mode" in
		config)
			if [ -z "${NRHUGE+set}" ]; then
				echo "## ERROR: cannot determine the default hugepage size from /proc/meminfo."
				echo "## Set NRHUGE to request an explicit number of hugepages."
				exit 1
			fi
			configure_linux
			;;
		cleanup)
			cleanup_linux
			;;
		reset)
			reset_linux
			;;
		status)
			status_linux
			;;
		help)
			usage "$0"
			;;
		*)
			usage "$0" "Invalid argument '$mode'"
			;;
	esac
else
	case "$mode" in
		config)
			configure_freebsd
			;;
		reset)
			reset_freebsd
			;;
		cleanup)
			echo "setup.sh cleanup function not yet supported on $(uname)"
			;;
		status)
			echo "setup.sh status function not yet supported on $(uname)"
			;;
		help)
			usage "$0"
			;;
		*)
			usage "$0" "Invalid argument '$mode'"
			;;
	esac
fi
613