xref: /spdk/scripts/setup.sh (revision a83f91c29a4740e4bea5f9509b7036e9e7dc2788)
#!/usr/bin/env bash
#
# Allocates hugepages and (re)binds NVMe, I/OAT and Virtio PCI devices.
# Run with "help" for the full usage text.

# Abort on the first unhandled command failure.
set -e

# Repository root, derived from this script's own location. Both expansions
# are quoted so that a checkout path containing whitespace still resolves.
rootdir=$(readlink -f "$(dirname "$0")")/..
# NOTE(review): common.sh is presumably where iter_pci_class_code and
# iter_pci_dev_id come from; they are used below but not defined in this file.
source "$rootdir/scripts/common.sh"
7
# Print the help text and terminate the script.
# Arguments:
#   $1 - path of the running script; only its basename is shown
#   $2 - optional error message. When present it is written to stderr ahead
#        of the help text and the script exits with status 1.
# Exits: 0 for a plain help request, 1 when an error message was supplied.
function usage()
{
	local options

	# The "status" mode is only offered on Linux.
	if [ "$(uname)" = Linux ]; then
		options="[config|reset|status|help]"
	else
		options="[config|reset|help]"
	fi

	if [[ -n $2 ]]; then
		echo "$2" >&2
		echo "" >&2
	fi
	echo "Helper script for allocating hugepages and binding NVMe, I/OAT and Virtio devices to"
	echo "a generic VFIO kernel driver. If VFIO is not available on the system, this script will"
	echo "fall back to UIO. NVMe and Virtio devices with active mountpoints will be ignored."
	echo "All hugepage operations use default hugepage size on the system (hugepagesz)."
	echo "Usage: $(basename "$1") $options"
	echo
	echo "$options - as following:"
	echo "config            Default mode. Allocate hugepages and bind PCI devices."
	echo "reset             Rebind PCI devices back to their original drivers."
	echo "                  Also cleanup any leftover spdk files/resources."
	echo "                  Hugepage memory size will remain unchanged."
	if [ "$(uname)" = Linux ]; then
		echo "status            Print status of all SPDK-compatible devices on the system."
	fi
	echo "help              Print this help message."
	echo
	echo "The following environment variables can be specified."
	echo "HUGEMEM           Size of hugepage memory to allocate (in MB). 2048 by default."
	echo "                  For NUMA systems, the hugepages will be evenly distributed"
	echo "                  between CPU nodes"
	echo "NRHUGE            Number of hugepages to allocate. This variable overwrites HUGEMEM."
	echo "HUGENODE          Specific NUMA node to allocate hugepages on. To allocate"
	echo "                  hugepages on multiple nodes run this script multiple times -"
	echo "                  once for each node."
	echo "PCI_WHITELIST     Whitespace separated list of PCI devices (NVMe, I/OAT, Virtio) to bind."
	echo "                  Each device must be specified as a full PCI address."
	echo "                  E.g. PCI_WHITELIST=\"0000:01:00.0 0000:02:00.0\""
	echo "                  To blacklist all PCI devices use a non-valid address."
	echo "                  E.g. PCI_WHITELIST=\"none\""
	echo "                  If empty or unset, all PCI devices will be bound."
	echo "TARGET_USER       User that will own hugepage mountpoint directory and vfio groups."
	echo "                  By default the current user will be used."

	# An invalid invocation must be distinguishable from a help request.
	if [[ -n $2 ]]; then
		exit 1
	fi
	exit 0
}
51
# Report whether a kernel driver is available.
# lsmod does not list drivers built into a monolithic kernel, so when lsmod
# has no entry the check falls back to /sys/module. Distinct codes are
# returned for module vs. /sys/module-only in case callers want to tell them
# apart later.
# Arguments:
#   $1 - driver name; dashes are also tried as underscores
# Returns:
#   1 - lsmod lists a module with exactly this name
#   2 - not listed by lsmod, but /sys/module/<name> exists
#   0 - driver not found
function check_for_driver {
	local name=$1
	local modname=${name//-/_}

	# Compare against the module-name column only, so that e.g. "nvme" is not
	# satisfied by "nvme_core" or by an entry in the "Used by" column.
	if lsmod | awk -v m="$modname" '$1 == m { found = 1 } END { exit !found }'; then
		return 1
	fi

	if [[ -d /sys/module/$name || -d /sys/module/$modname ]]; then
		return 2
	fi

	return 0
}
68
# Check a PCI address against the PCI_WHITELIST array.
# Arguments:
#   $1 - PCI address to look up; any further arguments are ignored
# Returns (note: inverted relative to the usual shell convention):
#   1 - the device may be bound (whitelist empty, or address is listed)
#   0 - the device is not whitelisted and must be skipped
# Callers therefore write: if pci_can_bind "$bdf"; then <skip device>; fi
function pci_can_bind() {
	local allowed

	if [[ ${#PCI_WHITELIST[@]} -eq 0 ]]; then
		# No whitelist specified, bind all devices.
		return 1
	fi

	for allowed in "${PCI_WHITELIST[@]}"; do
		if [[ "$allowed" == "$1" ]]; then
			return 1
		fi
	done
	return 0
}
83
# Bind a PCI device to a kernel driver through sysfs.
# Arguments:
#   $1 - PCI address of the device
#   $2 - name of the driver to bind to
# Behaviour:
#   - returns immediately when the device is already bound to $2
#   - otherwise unbinds the device from its current driver (if any), then
#     registers the vendor/device ID with $2 and binds the device to it
#   - if a /dev/vfio/<iommu group> node exists and TARGET_USER is set, the
#     node is chown'ed to TARGET_USER
# All working variables are local so the caller's driver_name/bdf survive.
function linux_bind_driver() {
	local bdf="$1"
	local driver_name="$2"
	local old_driver_name="no driver"
	local ven_dev_id iommu_group

	# "vendor:device" from lspci, rewritten as "vendor device" for new_id/remove_id.
	ven_dev_id=$(lspci -n -s "$bdf" | cut -d' ' -f3 | sed 's/:/ /')

	if [ -e "/sys/bus/pci/devices/$bdf/driver" ]; then
		old_driver_name=$(basename "$(readlink "/sys/bus/pci/devices/$bdf/driver")")

		if [ "$driver_name" = "$old_driver_name" ]; then
			return 0
		fi

		# remove_id is best effort: failures are deliberately ignored.
		echo "$ven_dev_id" > "/sys/bus/pci/devices/$bdf/driver/remove_id" 2> /dev/null || true
		echo "$bdf" > "/sys/bus/pci/devices/$bdf/driver/unbind"
	fi

	echo "$bdf ($ven_dev_id): $old_driver_name -> $driver_name"

	# Both writes are best effort; errors are deliberately ignored.
	echo "$ven_dev_id" > "/sys/bus/pci/drivers/$driver_name/new_id" 2> /dev/null || true
	echo "$bdf" > "/sys/bus/pci/drivers/$driver_name/bind" 2> /dev/null || true

	iommu_group=$(basename "$(readlink -f "/sys/bus/pci/devices/$bdf/iommu_group")")
	if [ -e "/dev/vfio/$iommu_group" ] && [ -n "$TARGET_USER" ]; then
		chown "$TARGET_USER" "/dev/vfio/$iommu_group"
	fi
}
113
# Detach a PCI device from whatever kernel driver currently owns it.
# Arguments:
#   $1 - PCI address of the device
# Does nothing when the device has no driver. Working variables are local so
# that the caller's variables of the same name are not overwritten.
function linux_unbind_driver() {
	local bdf="$1"
	local ven_dev_id old_driver_name

	# "vendor:device" from lspci, rewritten as "vendor device" for remove_id.
	ven_dev_id=$(lspci -n -s "$bdf" | cut -d' ' -f3 | sed 's/:/ /')

	if ! [ -e "/sys/bus/pci/devices/$bdf/driver" ]; then
		return 0
	fi

	old_driver_name=$(basename "$(readlink "/sys/bus/pci/devices/$bdf/driver")")

	# remove_id is best effort: failures are deliberately ignored.
	echo "$ven_dev_id" > "/sys/bus/pci/devices/$bdf/driver/remove_id" 2> /dev/null || true
	echo "$bdf" > "/sys/bus/pci/devices/$bdf/driver/unbind"
	echo "$bdf ($ven_dev_id): $old_driver_name -> no driver"
}
128
# Print the mount point of every mounted hugetlbfs filesystem, one per line.
# Prints nothing when none is mounted.
function linux_hugetlbfs_mounts() {
	# `mount` lines read "<dev> on <mountpoint> type <fstype> (<opts>)";
	# awk filters and extracts in one pass, so no separate grep is needed.
	mount | awk '/ type hugetlbfs / { print $3 }'
}
132
# Look up the NVMe block device that sits on a given PCI address.
# Arguments:
#   $1 - PCI address to search for
#   $2 - name of the caller's variable that receives the block device name;
#        it is left untouched when no device matches
# Returns: 0 in all cases.
# Locals carry a leading underscore so they cannot shadow the variable named
# by $2. The shell's errexit setting is not modified.
function get_nvme_name_from_bdf {
	local _bdf=$1 _out_var=$2
	local _nvme_devs _dev _link_name

	# grep exits non-zero when there are no nvme block devices; that is not
	# an error here.
	_nvme_devs=$(lsblk -d --output NAME | grep "^nvme") || true

	for _dev in $_nvme_devs; do
		# Try the device/device link first and fall back to the device link.
		_link_name=$(readlink "/sys/block/$_dev/device/device") || true
		if [ -z "$_link_name" ]; then
			_link_name=$(readlink "/sys/block/$_dev/device") || true
		fi
		if [ -z "$_link_name" ]; then
			continue
		fi
		if [ "$(basename "$_link_name")" = "$_bdf" ]; then
			printf -v "$_out_var" '%s' "$_dev"
			return 0
		fi
	done
	return 0
}
149
# Collect the block devices whose /sys/block link passes through a given PCI
# address.
# Arguments:
#   $1 - PCI address to search for (matched as a literal string)
#   $2 - name of the caller's variable that receives the result: every
#        matching device name, each preceded by a single space; empty when
#        nothing matches
# Locals carry a leading underscore so they cannot shadow the variable named
# by $2.
function get_virtio_names_from_bdf {
	local _bdf=$1 _out_var=$2
	local _blk_devs _dev _names=''

	# --noheadings keeps lsblk's "NAME" header out of the device list.
	_blk_devs=$(lsblk --nodeps --noheadings --output NAME) || true

	for _dev in $_blk_devs; do
		# -F: the dots in a PCI address must not act as regex wildcards.
		if readlink "/sys/block/$_dev" | grep -qF -- "$_bdf"; then
			_names="$_names $_dev"
		fi
	done

	printf -v "$_out_var" '%s' "$_names"
}
162
# Bind every eligible NVMe, I/OAT and Virtio PCI device to a userspace
# driver: vfio-pci, or uio_pci_generic when /sys/kernel/iommu_groups is empty.
# Devices rejected by pci_can_bind, and NVMe/Virtio devices whose block
# devices have active mountpoints, are skipped.
# Globals:
#   driver_name (written) - left global on purpose; configure_linux reads it
#                           after this function returns
#   rootdir, PCI_WHITELIST (read)
function configure_linux_pci {
	local bdf dev_id blkname blknames mountpoints

	driver_name=vfio-pci
	if [ -z "$(ls /sys/kernel/iommu_groups 2> /dev/null)" ]; then
		# No IOMMU. Use uio.
		driver_name=uio_pci_generic
	fi

	# NVMe
	modprobe "$driver_name" || true
	for bdf in $(iter_pci_class_code 01 08 02); do
		blkname=''
		get_nvme_name_from_bdf "$bdf" blkname
		# pci_can_bind succeeds (returns 0) when the device is NOT whitelisted.
		if pci_can_bind "$bdf"; then
			echo "Skipping un-whitelisted NVMe controller $blkname ($bdf)"
			continue
		fi
		if [ "$blkname" != "" ]; then
			mountpoints=$(lsblk "/dev/$blkname" --output MOUNTPOINT -n | wc -w)
		else
			mountpoints="0"
		fi
		if [ "$mountpoints" = "0" ]; then
			linux_bind_driver "$bdf" "$driver_name"
		else
			echo "Active mountpoints on /dev/$blkname, so not binding PCI dev $bdf"
		fi
	done

	# IOAT
	# Device IDs are the hex digits after "0x" on each matching line of
	# pci_ids.h; they are read directly, so there is no temp file to clean up.
	for dev_id in $(grep "PCI_DEVICE_ID_INTEL_IOAT" "$rootdir/include/spdk/pci_ids.h" \
		| awk -F"x" '{print $2}'); do
		for bdf in $(iter_pci_dev_id 8086 "$dev_id"); do
			if pci_can_bind "$bdf"; then
				echo "Skipping un-whitelisted I/OAT device at $bdf"
				continue
			fi
			linux_bind_driver "$bdf" "$driver_name"
		done
	done

	# virtio
	for dev_id in $(grep "PCI_DEVICE_ID_VIRTIO" "$rootdir/include/spdk/pci_ids.h" \
		| awk -F"x" '{print $2}'); do
		for bdf in $(iter_pci_dev_id 1af4 "$dev_id"); do
			if pci_can_bind "$bdf"; then
				echo "Skipping un-whitelisted Virtio device at $bdf"
				continue
			fi
			blknames=''
			get_virtio_names_from_bdf "$bdf" blknames
			for blkname in $blknames; do
				if mount | grep -q "/dev/$blkname"; then
					echo "Active mountpoints on /dev/$blkname, so not binding PCI dev $bdf"
					# Skip to the next PCI device, not just the next block device.
					continue 2
				fi
			done

			linux_bind_driver "$bdf" "$driver_name"
		done
	done

	echo "1" > "/sys/bus/pci/rescan"
}
236
# "config" mode on Linux: bind PCI devices, make sure hugetlbfs is mounted,
# allocate NRHUGE hugepages and, when vfio-pci is in use, hand the hugetlbfs
# mount points to TARGET_USER and report the memlock limit.
# Globals read: NRHUGE, HUGENODE, HUGEPGSZ, TARGET_USER, driver_name (set by
# configure_linux_pci).
# Exits with status 1 when fewer than NRHUGE hugepages could be allocated.
function configure_linux {
	local hugetlbfs_mounts hugepages_target allocated_hugepages
	local mnt memlock_kb memlock_mb

	configure_linux_pci
	hugetlbfs_mounts=$(linux_hugetlbfs_mounts)

	if [ -z "$hugetlbfs_mounts" ]; then
		hugetlbfs_mounts=/mnt/huge
		echo "Mounting hugetlbfs at $hugetlbfs_mounts"
		mkdir -p "$hugetlbfs_mounts"
		mount -t hugetlbfs nodev "$hugetlbfs_mounts"
	fi

	# System-wide counter by default, per-node counter when HUGENODE is set.
	if [ -z "$HUGENODE" ]; then
		hugepages_target="/proc/sys/vm/nr_hugepages"
	else
		hugepages_target="/sys/devices/system/node/node${HUGENODE}/hugepages/hugepages-${HUGEPGSZ}kB/nr_hugepages"
	fi

	echo "$NRHUGE" > "$hugepages_target"
	# Read the value back and compare it with what was requested.
	allocated_hugepages=$(cat "$hugepages_target")
	if [ "$allocated_hugepages" -lt "$NRHUGE" ]; then
		{
			echo ""
			echo "## ERROR: requested $NRHUGE hugepages but only $allocated_hugepages could be allocated."
			echo "## Memory might be heavily fragmented. Please try flushing the system cache, or reboot the machine."
		} >&2
		exit 1
	fi

	if [ "$driver_name" = "vfio-pci" ]; then
		if [ -n "$TARGET_USER" ]; then
			for mnt in $hugetlbfs_mounts; do
				chown "$TARGET_USER" "$mnt"
				chmod g+w "$mnt"
			done
		fi

		memlock_kb=$(ulimit -l)
		if [ "$memlock_kb" != "unlimited" ]; then
			memlock_mb=$((memlock_kb / 1024))
			echo ""
			echo "Current user memlock limit: ${memlock_mb} MB"
			echo ""
			echo "This is the maximum amount of memory you will be"
			echo "able to use with DPDK and VFIO if run as current user."
			echo -n "To change this, please adjust limits.conf memlock "
			echo "limit for current user."

			# 65536 corresponds to the 64MB mentioned in the warning.
			if [ "$memlock_kb" -lt 65536 ]; then
				echo ""
				echo "## WARNING: memlock limit is less than 64MB"
				echo -n "## DPDK with VFIO may not be able to initialize "
				echo "if run as current user."
			fi
		fi
	fi
}
291
# Return NVMe, I/OAT and Virtio PCI devices to their kernel drivers.
# NVMe and I/OAT devices are rebound to nvme / ioatdma when check_for_driver
# finds that driver, and are simply unbound otherwise. Virtio devices are
# always rebound to virtio-pci. Devices rejected by pci_can_bind are skipped.
# Globals read: rootdir, PCI_WHITELIST.
function reset_linux_pci {
	local bdf dev_id blkname driver_loaded

	# NVMe
	# check_for_driver returns non-zero when the driver IS present, so its
	# status is captured with `||` rather than by toggling errexit.
	driver_loaded=0
	check_for_driver nvme || driver_loaded=$?
	for bdf in $(iter_pci_class_code 01 08 02); do
		# pci_can_bind succeeds (returns 0) when the device is NOT whitelisted.
		if pci_can_bind "$bdf"; then
			# Resolve the block device name for the message; it stays empty
			# when the controller has no nvme block device.
			blkname=''
			get_nvme_name_from_bdf "$bdf" blkname
			echo "Skipping un-whitelisted NVMe controller $blkname ($bdf)"
			continue
		fi
		if [ "$driver_loaded" -ne 0 ]; then
			linux_bind_driver "$bdf" nvme
		else
			linux_unbind_driver "$bdf"
		fi
	done

	# IOAT
	driver_loaded=0
	check_for_driver ioatdma || driver_loaded=$?
	# Device IDs are the hex digits after "0x" on each matching line of
	# pci_ids.h; they are read directly, so there is no temp file to clean up.
	for dev_id in $(grep "PCI_DEVICE_ID_INTEL_IOAT" "$rootdir/include/spdk/pci_ids.h" \
		| awk -F"x" '{print $2}'); do
		for bdf in $(iter_pci_dev_id 8086 "$dev_id"); do
			if pci_can_bind "$bdf"; then
				echo "Skipping un-whitelisted I/OAT device at $bdf"
				continue
			fi
			if [ "$driver_loaded" -ne 0 ]; then
				linux_bind_driver "$bdf" ioatdma
			else
				linux_unbind_driver "$bdf"
			fi
		done
	done

	# virtio
	# TODO: check if virtio-pci is loaded first and just unbind if it is not loaded
	# Requires some more investigation - for example, some kernels do not seem to have
	#  virtio-pci but just virtio_scsi instead.  Also need to make sure we get the
	#  underscore vs. dash right in the virtio_scsi name.
	modprobe virtio-pci || true
	for dev_id in $(grep "PCI_DEVICE_ID_VIRTIO" "$rootdir/include/spdk/pci_ids.h" \
		| awk -F"x" '{print $2}'); do
		for bdf in $(iter_pci_dev_id 1af4 "$dev_id"); do
			if pci_can_bind "$bdf"; then
				echo "Skipping un-whitelisted Virtio device at $bdf"
				continue
			fi
			linux_bind_driver "$bdf" virtio-pci
		done
	done

	echo "1" > "/sys/bus/pci/rescan"
}
359
# "reset" mode on Linux: rebind PCI devices to their kernel drivers, then
# delete leftover spdk*map_* files from every hugetlbfs mount point and any
# /run/.spdk* files.
function reset_linux {
	# Local, so that no global named "mount" is created or overwritten.
	local mnt

	reset_linux_pci
	for mnt in $(linux_hugetlbfs_mounts); do
		rm -f "$mnt"/spdk*map_*
	done
	rm -f /run/.spdk*
}
367
# "status" mode on Linux: print a hugepage table (per NUMA node when the
# per-node sysfs entries exist, otherwise system-wide from /proc/meminfo),
# followed by the driver binding of every NVMe, I/OAT and Virtio device.
# Globals read: rootdir, HUGEPGSZ.
function status_linux {
	local numa_nodes=0 path free_pages all_pages node huge_size
	local bdf dev_id dev_ids driver name blknames

	echo "Hugepages"
	printf "%-6s %10s %8s / %6s\n" "node" "hugesize"  "free" "total"

	# nullglob makes the loop body run zero times when nothing matches.
	shopt -s nullglob
	# node[0-9]* rather than node? so that nodes 10 and above are listed too.
	for path in /sys/devices/system/node/node[0-9]*/hugepages/hugepages-*/; do
		numa_nodes=$((numa_nodes + 1))
		free_pages=$(cat "$path/free_hugepages")
		all_pages=$(cat "$path/nr_hugepages")

		# Pull "nodeN" and "<size>kB" out of the path.
		[[ $path =~ (node[0-9]+)/hugepages/hugepages-([0-9]+kB) ]]

		node=${BASH_REMATCH[1]}
		huge_size=${BASH_REMATCH[2]}

		printf "%-6s %10s %8s / %6s\n" "$node" "$huge_size" "$free_pages" "$all_pages"
	done
	shopt -u nullglob

	# fall back to system-wide hugepages
	if [ "$numa_nodes" = "0" ]; then
		free_pages=$(grep HugePages_Free /proc/meminfo | awk '{ print $2 }')
		all_pages=$(grep HugePages_Total /proc/meminfo | awk '{ print $2 }')
		node="-"
		huge_size="$HUGEPGSZ"

		# Quoted so an empty value cannot shift the remaining columns.
		printf "%-6s %10s %8s / %6s\n" "$node" "$huge_size" "$free_pages" "$all_pages"
	fi

	echo "NVMe devices"

	echo -e "BDF\t\tNuma Node\tDriver name\t\tDevice name"
	for bdf in $(iter_pci_class_code 01 08 02); do
		driver=$(grep DRIVER "/sys/bus/pci/devices/$bdf/uevent" | awk -F"=" '{print $2}')
		node=$(cat "/sys/bus/pci/devices/$bdf/numa_node")
		if [[ "$driver" = "nvme" && -d /sys/bus/pci/devices/$bdf/nvme ]]; then
			name="\t"$(ls "/sys/bus/pci/devices/$bdf/nvme")
		else
			name="-"
		fi
		echo -e "$bdf\t$node\t\t$driver\t\t$name"
	done

	echo "I/OAT DMA"

	# Device IDs are the hex digits after "0x" on each matching line of pci_ids.h.
	dev_ids=$(grep "PCI_DEVICE_ID_INTEL_IOAT" "$rootdir/include/spdk/pci_ids.h" \
		| awk -F"x" '{print $2}')
	echo -e "BDF\t\tNuma Node\tDriver Name"
	for dev_id in $dev_ids; do
		for bdf in $(iter_pci_dev_id 8086 "$dev_id"); do
			driver=$(grep DRIVER "/sys/bus/pci/devices/$bdf/uevent" | awk -F"=" '{print $2}')
			node=$(cat "/sys/bus/pci/devices/$bdf/numa_node")
			echo -e "$bdf\t$node\t\t$driver"
		done
	done

	echo "virtio"

	dev_ids=$(grep "PCI_DEVICE_ID_VIRTIO" "$rootdir/include/spdk/pci_ids.h" \
		| awk -F"x" '{print $2}')
	echo -e "BDF\t\tNuma Node\tDriver Name\t\tDevice Name"
	for dev_id in $dev_ids; do
		for bdf in $(iter_pci_dev_id 1af4 "$dev_id"); do
			driver=$(grep DRIVER "/sys/bus/pci/devices/$bdf/uevent" | awk -F"=" '{print $2}')
			node=$(cat "/sys/bus/pci/devices/$bdf/numa_node")
			blknames=''
			get_virtio_names_from_bdf "$bdf" blknames
			echo -e "$bdf\t$node\t\t$driver\t\t$blknames"
		done
	done
}
442
# Hand NVMe and I/OAT PCI devices to the nic_uio kernel module on FreeBSD.
# Builds a comma-separated list of PCI addresses (from fields 2-4 of each
# matching `pciconf -l` line), stores it in the kenv variable
# hw.nic_uio.bdfs, and reloads nic_uio.ko.
# Globals read: rootdir.
function configure_freebsd_pci {
	local grep_str dev_id bdfs

	# NVMe
	grep_str="class=0x010802"

	# IOAT: one extra alternative per device ID listed in pci_ids.h.
	for dev_id in $(grep "PCI_DEVICE_ID_INTEL_IOAT" "$rootdir/include/spdk/pci_ids.h" \
		| awk -F"x" '{print $2}'); do
		grep_str="${grep_str}\|chip=0x${dev_id}8086"
	done

	# The awk program is passed inline, so no temp file is needed.
	bdfs=$(pciconf -l | grep "${grep_str}" \
		| awk -F: '{if (count > 0) printf ","; printf "%s:%s:%s",$2,$3,$4; count++}')

	# Unloading fails when the module is not loaded; that is fine.
	kldunload nic_uio.ko || true
	kenv hw.nic_uio.bdfs="$bdfs"
	kldload nic_uio.ko
}
466
# "config" mode on FreeBSD: set up PCI devices, then make sure contigmem is
# loaded with HUGEMEM split into 256 MB buffers.
function configure_freebsd {
	configure_freebsd_pci

	# A loaded contigmem whose buffer count differs from the requested one is
	# unloaded here so that it gets reloaded below with the new value.
	if kldstat -q -m contigmem \
		&& [ $(kenv hw.contigmem.num_buffers) -ne "$((HUGEMEM / 256))" ]; then
		kldunload contigmem.ko
	fi

	# Nothing more to do when contigmem is (still) loaded.
	kldstat -q -m contigmem && return 0

	kenv hw.contigmem.num_buffers=$((HUGEMEM / 256))
	kenv hw.contigmem.buffer_size=$((256 * 1024 * 1024))
	kldload contigmem.ko
}
482
# "reset" mode on FreeBSD: unload contigmem and nic_uio, in that order.
# A failed unload is ignored.
function reset_freebsd {
	local kmod
	for kmod in contigmem.ko nic_uio.ko; do
		kldunload "$kmod" || true
	done
}
487
# ---- entry point ----------------------------------------------------------
# First argument selects the mode; "config" is the default.
mode=${1:-config}

: "${HUGEMEM:=2048}"
: "${PCI_WHITELIST:=""}"

# NVME_WHITELIST entries are appended to PCI_WHITELIST.
if [ -n "$NVME_WHITELIST" ]; then
	PCI_WHITELIST="$PCI_WHITELIST $NVME_WHITELIST"
fi

# SKIP_PCI replaces the whitelist with "none", which matches no PCI address.
if [ -n "$SKIP_PCI" ]; then
	PCI_WHITELIST="none"
fi

# Split the whitespace-separated list into an array. Globbing is switched off
# for the split so an entry such as "*" cannot expand to file names.
set -f
declare -a PCI_WHITELIST=(${PCI_WHITELIST})
set +f

# Default owner: the sudo caller if there is one, otherwise the login name.
if [ -z "$TARGET_USER" ]; then
	TARGET_USER="$SUDO_USER"
	if [ -z "$TARGET_USER" ]; then
		TARGET_USER=$(logname 2> /dev/null) || true
	fi
fi

if [ "$(uname)" = Linux ]; then
	# Default hugepage size as reported by /proc/meminfo (digits only).
	HUGEPGSZ=$(($(grep Hugepagesize /proc/meminfo | cut -d : -f 2 | tr -dc '0-9')))
	HUGEPGSZ_MB=$((HUGEPGSZ / 1024))
	# Only computed when NRHUGE is unset: HUGEMEM rounded up to whole pages.
	: "${NRHUGE=$(((HUGEMEM + HUGEPGSZ_MB - 1) / HUGEPGSZ_MB))}"

	case "$mode" in
		config) configure_linux ;;
		reset) reset_linux ;;
		status) status_linux ;;
		help) usage "$0" ;;
		*) usage "$0" "Invalid argument '$mode'" ;;
	esac
else
	case "$mode" in
		config) configure_freebsd ;;
		reset) reset_freebsd ;;
		help) usage "$0" ;;
		*) usage "$0" "Invalid argument '$mode'" ;;
	esac
fi
541