xref: /spdk/scripts/setup.sh (revision 5fd9561f54daa8eff7f3bcb56c789655bca846b1)
#!/usr/bin/env bash

set -e

os=$(uname -s)

if [[ $os != Linux && $os != FreeBSD ]]; then
	echo "Unsupported platform ($os), aborting"
	exit 1
fi

rootdir=$(readlink -f $(dirname $0))/..
source "$rootdir/scripts/common.sh"

function usage() {
	if [[ $os == Linux ]]; then
		options="[config|reset|status|cleanup|help]"
	else
		options="[config|reset|help]"
	fi

	[[ -n $2 ]] && (
		echo "$2"
		echo ""
	)
	echo "Helper script for allocating hugepages and binding NVMe, I/OAT, VMD and Virtio devices"
	echo "to a generic VFIO kernel driver. If VFIO is not available on the system, this script"
	echo "will fall back to UIO. NVMe and Virtio devices with active mountpoints will be ignored."
	echo "All hugepage operations use the default hugepage size on the system (hugepagesz)."
	echo "Usage: $(basename $1) $options"
	echo
	echo "$options - as follows:"
	echo "config            Default mode. Allocate hugepages and bind PCI devices."
	if [[ $os == Linux ]]; then
		echo "cleanup           Remove any orphaned files that may be left in the system after an SPDK application exits"
	fi
	echo "reset             Rebind PCI devices back to their original drivers."
	echo "                  Also clean up any leftover spdk files/resources."
	echo "                  Hugepage memory size will remain unchanged."
	if [[ $os == Linux ]]; then
		echo "status            Print status of all SPDK-compatible devices on the system."
	fi
	echo "help              Print this help message."
	echo
	echo "The following environment variables can be specified."
	echo "HUGEMEM           Size of hugepage memory to allocate (in MB). 2048 by default."
	echo "                  For NUMA systems, the hugepages will be distributed on node0 by"
	echo "                  default."
	echo "HUGE_EVEN_ALLOC   If set to 'yes', hugepages will be evenly distributed across all"
	echo "                  of the system's NUMA nodes (effectively ignoring anything set in HUGENODE)."
	echo "                  Uses the kernel's default hugepage size."
	echo "NRHUGE            Number of hugepages to allocate. This variable overrides HUGEMEM."
	echo "HUGENODE          Specific NUMA node to allocate hugepages on. Multiple nodes can be"
	echo "                  separated with commas. By default, NRHUGE will be applied on each node."
	echo "                  Hugepages can be defined per node with e.g.:"
	echo "                  HUGENODE='nodes_hp[0]=2048,nodes_hp[1]=512,2' - this will allocate"
	echo "                  2048 pages for node0, 512 for node1 and default NRHUGE for node2."
	echo "HUGEPGSZ          Size of the hugepages to use in kB. If not set, the kernel's default"
	echo "                  setting is used."
	echo "CLEAR_HUGE        If set to 'yes', an attempt to remove hugepages from all nodes will"
	echo "                  be made prior to allocation."
	echo "PCI_ALLOWED"
	echo "PCI_BLOCKED       Whitespace separated list of PCI devices (NVMe, I/OAT, VMD, Virtio)."
	echo "                  Each device must be specified as a full PCI address."
	echo "                  E.g. PCI_ALLOWED=\"0000:01:00.0 0000:02:00.0\""
	echo "                  To block all PCI devices: PCI_ALLOWED=\"none\""
	echo "                  To allow all PCI devices except 0000:01:00.0: PCI_BLOCKED=\"0000:01:00.0\""
	echo "                  To allow only PCI device 0000:01:00.0: PCI_ALLOWED=\"0000:01:00.0\""
	echo "                  If PCI_ALLOWED and PCI_BLOCKED are empty or unset, all PCI devices"
	echo "                  will be bound."
	echo "                  Each device in PCI_BLOCKED will be ignored (driver won't be changed)."
	echo "                  PCI_BLOCKED has precedence over PCI_ALLOWED."
	echo "TARGET_USER       User that will own the hugepage mountpoint directory and vfio groups."
	echo "                  By default the current user will be used."
	echo "DRIVER_OVERRIDE   Disable automatic vfio-pci/uio_pci_generic selection and forcefully"
	echo "                  bind devices to the given driver."
	echo "                  E.g. DRIVER_OVERRIDE=uio_pci_generic or DRIVER_OVERRIDE=/home/public/dpdk/build/kmod/igb_uio.ko"
	echo "PCI_BLOCK_SYNC_ON_RESET"
	echo "                  If set in the environment, an attempt to wait for block devices associated"
	echo "                  with a given PCI device will be made upon reset."
	exit 0
}
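
# Illustrative invocations (values and BDFs are examples only):
#   sudo ./scripts/setup.sh                            # config mode, default 2048 MB of hugepages
#   sudo HUGEMEM=4096 ./scripts/setup.sh config        # allocate 4 GB of hugepages instead
#   sudo PCI_BLOCKED="0000:01:00.0" ./scripts/setup.sh # bind everything except one device
#   sudo ./scripts/setup.sh reset                      # rebind devices to their original drivers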

# In monolithic kernels lsmod won't work, so back it with a check of
# /sys/module. We also check /sys/bus/pci/drivers/ as neither lsmod nor
# /sys/module may contain the needed info (like on Fedora-like OSes).
function check_for_driver() {
	if [[ -z $1 ]]; then
		return 0
	fi

	if lsmod | grep -q ${1//-/_}; then
		return 1
	fi

	if [[ -d /sys/module/${1} || -d \
		/sys/module/${1//-/_} || -d \
		/sys/bus/pci/drivers/${1} || -d \
		/sys/bus/pci/drivers/${1//-/_} ]]; then
		return 2
	fi
	return 0
}
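
# Note the inverted return convention above: 0 means the driver was NOT found,
# while 1 (lsmod) and 2 (sysfs) mean it is present. Callers therefore negate it:
#   if ! check_for_driver igb_uio; then echo "igb_uio is loaded"; fi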

function check_for_driver_freebsd() {
	# Check if dpdk drivers (nic_uio, contigmem) are in the kernel's module path.
	local search_paths path driver
	IFS=";" read -ra search_paths < <(kldconfig -rU)

	for driver in contigmem.ko nic_uio.ko; do
		for path in "${search_paths[@]}"; do
			[[ -f $path/$driver ]] && continue 2
		done
		return 1
	done
	return 0
}
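
# kldconfig -rU prints the kernel module search path as a single ";"-separated
# line, e.g. (illustrative output): /boot/kernel;/boot/modules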

function pci_dev_echo() {
	local bdf="$1"
	shift
	echo "$bdf (${pci_ids_vendor["$bdf"]#0x} ${pci_ids_device["$bdf"]#0x}): $*"
}
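
# Example output of the helper above for a hypothetical device (vendor/device
# IDs and message are illustrative):
#   0000:01:00.0 (8086 0953): nvme -> vfio-pci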

function linux_bind_driver() {
	bdf="$1"
	driver_name="$2"
	old_driver_name=${drivers_d["$bdf"]:-no driver}
	ven_dev_id="${pci_ids_vendor["$bdf"]#0x} ${pci_ids_device["$bdf"]#0x}"

	if [[ $driver_name == "$old_driver_name" ]]; then
		pci_dev_echo "$bdf" "Already using the $old_driver_name driver"
		return 0
	fi

	if [[ $old_driver_name != "no driver" ]]; then
		echo "$ven_dev_id" > "/sys/bus/pci/devices/$bdf/driver/remove_id" 2> /dev/null || true
		echo "$bdf" > "/sys/bus/pci/devices/$bdf/driver/unbind"
	fi

	pci_dev_echo "$bdf" "$old_driver_name -> $driver_name"

	if [[ $driver_name == "none" ]]; then
		return 0
	fi

	echo "$ven_dev_id" > "/sys/bus/pci/drivers/$driver_name/new_id" 2> /dev/null || true
	echo "$bdf" > "/sys/bus/pci/drivers/$driver_name/bind" 2> /dev/null || true

	if [[ $driver_name == uio_pci_generic ]] && ! check_for_driver igb_uio; then
		# Check if the uio_pci_generic driver is broken, as it might be in
		# some 4.18.x kernels (see centos8 for instance) - if our device
		# didn't get a proper uio entry, fall back to igb_uio.
		if [[ ! -e /sys/bus/pci/devices/$bdf/uio ]]; then
			pci_dev_echo "$bdf" "uio_pci_generic potentially broken, moving to igb_uio"
			drivers_d["$bdf"]="no driver"
			# This call will override $driver_name for the remaining devices as well
			linux_bind_driver "$bdf" igb_uio
		fi
	fi

	iommu_group=$(basename $(readlink -f /sys/bus/pci/devices/$bdf/iommu_group))
	if [ -e "/dev/vfio/$iommu_group" ]; then
		if [ -n "$TARGET_USER" ]; then
			chown "$TARGET_USER" "/dev/vfio/$iommu_group"
		fi
	fi
}

function linux_unbind_driver() {
	local bdf="$1"
	local ven_dev_id
	ven_dev_id="${pci_ids_vendor["$bdf"]#0x} ${pci_ids_device["$bdf"]#0x}"
	local old_driver_name=${drivers_d["$bdf"]:-no driver}

	if [[ $old_driver_name == "no driver" ]]; then
		pci_dev_echo "$bdf" "Not bound to any driver"
		return 0
	fi

	if [[ -e /sys/bus/pci/drivers/$old_driver_name ]]; then
		echo "$ven_dev_id" > "/sys/bus/pci/drivers/$old_driver_name/remove_id" 2> /dev/null || true
		echo "$bdf" > "/sys/bus/pci/drivers/$old_driver_name/unbind"
	fi

	pci_dev_echo "$bdf" "$old_driver_name -> no driver"
}

function linux_hugetlbfs_mounts() {
	mount | grep ' type hugetlbfs ' | awk '{ print $3 }'
}
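
# A mount(8) line such as "nodev on /mnt/huge type hugetlbfs (rw,relatime)"
# yields "/mnt/huge" above - field 3 is the mountpoint.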

function get_block_dev_from_bdf() {
	local bdf=$1
	local block

	for block in /sys/block/*; do
		if [[ $(readlink -f "$block/device") == *"/$bdf/"* ]]; then
			echo "${block##*/}"
		fi
	done
}

function get_used_bdf_block_devs() {
	local bdf=$1
	local blocks block blockp dev mount holder
	local used

	hash lsblk || return 1
	blocks=($(get_block_dev_from_bdf "$bdf"))

	for block in "${blocks[@]}"; do
		# Check if the device is held by some other device, regardless of
		# whether it's mounted or not.
		for holder in "/sys/class/block/$block"*/holders/*; do
			[[ -e $holder ]] || continue
			blockp=${holder%/holders*} blockp=${blockp##*/}
			if [[ -e $holder/slaves/$blockp ]]; then
				used+=("holder@$blockp:${holder##*/}")
			fi
		done
		while read -r dev mount; do
			if [[ -e $mount ]]; then
				used+=("mount@$block:$dev")
			fi
		done < <(lsblk -l -n -o NAME,MOUNTPOINT "/dev/$block")
		if ((${#used[@]} == 0)); then
			# Make sure we check whether there's any valid data present on the target
			# device, regardless of whether it's being actively used or not. This is
			# mainly done to make sure we don't miss more complex setups like ZFS pools, etc.
			if block_in_use "$block" > /dev/null; then
				used+=("data@$block")
			fi
		fi
	done

	if ((${#used[@]} > 0)); then
		printf '%s\n' "${used[@]}"
	fi
}
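
# Entries printed above are tag@detail pairs, e.g. (hypothetical devices):
#   holder@nvme0n1:dm-0     - nvme0n1 is claimed by device-mapper volume dm-0
#   mount@nvme0n1:nvme0n1p1 - a partition on nvme0n1 is mounted
#   data@nvme0n1            - no mounts/holders, but a data signature was found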

function collect_devices() {
	# NVMe, IOAT, IDXD, VIRTIO, VMD

	local ids dev_type dev_id bdf bdfs in_use driver

	ids+="PCI_DEVICE_ID_INTEL_IOAT"
	ids+="|PCI_DEVICE_ID_INTEL_IDXD"
	ids+="|PCI_DEVICE_ID_VIRTIO"
	ids+="|PCI_DEVICE_ID_INTEL_VMD"
	ids+="|SPDK_PCI_CLASS_NVME"

	local -gA nvme_d ioat_d idxd_d virtio_d vmd_d all_devices_d drivers_d

	while read -r _ dev_type dev_id; do
		bdfs=(${pci_bus_cache["0x8086:$dev_id"]})
		[[ $dev_type == *NVME* ]] && bdfs=(${pci_bus_cache["$dev_id"]})
		[[ $dev_type == *VIRT* ]] && bdfs=(${pci_bus_cache["0x1af4:$dev_id"]})
		[[ $dev_type =~ (NVME|IOAT|IDXD|VIRTIO|VMD) ]] && dev_type=${BASH_REMATCH[1],,}
		for bdf in "${bdfs[@]}"; do
			in_use=0
			if [[ $1 != status ]]; then
				if ! pci_can_use "$bdf"; then
					pci_dev_echo "$bdf" "Skipping denied controller at $bdf"
					in_use=1
				fi
				if [[ $dev_type == nvme || $dev_type == virtio ]]; then
					if ! verify_bdf_block_devs "$bdf"; then
						in_use=1
					fi
				fi
				if [[ $dev_type == vmd ]]; then
					if [[ $PCI_ALLOWED != *"$bdf"* ]]; then
						pci_dev_echo "$bdf" "Skipping not-allowed VMD controller at $bdf"
						in_use=1
					elif [[ " ${drivers_d[*]} " =~ "nvme" ]]; then
						if [[ "${DRIVER_OVERRIDE}" != "none" ]]; then
							if [ "$mode" == "config" ]; then
								cat <<- MESSAGE
									Binding new driver to VMD device. If there are NVMe SSDs behind the VMD endpoint
									which are attached to the kernel NVMe driver, the binding process may go faster
									if you first run this script with DRIVER_OVERRIDE="none" to unbind only the
									NVMe SSDs, and then run again to unbind the VMD devices.
								MESSAGE
							fi
						fi
					fi
				fi
			fi
			eval "${dev_type}_d[$bdf]=$in_use"
			all_devices_d["$bdf"]=$in_use
			if [[ -e /sys/bus/pci/devices/$bdf/driver ]]; then
				driver=$(readlink -f "/sys/bus/pci/devices/$bdf/driver")
				drivers_d["$bdf"]=${driver##*/}
			fi
		done
	done < <(grep -E "$ids" "$rootdir/include/spdk/pci_ids.h")
}

function collect_driver() {
	local bdf=$1
	local drivers driver

	if [[ -e /sys/bus/pci/devices/$bdf/modalias ]] \
		&& drivers=($(modprobe -R "$(< "/sys/bus/pci/devices/$bdf/modalias")")); then
		# Pick first entry in case multiple aliases are bound to a driver.
		driver=$(readlink -f "/sys/module/${drivers[0]}/drivers/pci:"*)
		driver=${driver##*/}
	else
		[[ -n ${nvme_d["$bdf"]} ]] && driver=nvme
		[[ -n ${ioat_d["$bdf"]} ]] && driver=ioatdma
		[[ -n ${idxd_d["$bdf"]} ]] && driver=idxd
		[[ -n ${virtio_d["$bdf"]} ]] && driver=virtio-pci
		[[ -n ${vmd_d["$bdf"]} ]] && driver=vmd
	fi 2> /dev/null
	echo "$driver"
}

function verify_bdf_block_devs() {
	local bdf=$1
	local blknames
	blknames=($(get_used_bdf_block_devs "$bdf")) || return 1

	if ((${#blknames[@]} > 0)); then
		local IFS=","
		pci_dev_echo "$bdf" "Active devices: ${blknames[*]}, so not binding PCI dev"
		return 1
	fi
}

function configure_linux_pci() {
	local driver_path=""
	driver_name=""
	igb_uio_fallback=""

	if [[ -r "$rootdir/dpdk/build-tmp/kernel/linux/igb_uio/igb_uio.ko" ]]; then
		# igb_uio is a common driver to override with and it depends on uio.
		modprobe uio || true
		if ! check_for_driver igb_uio || insmod "$rootdir/dpdk/build-tmp/kernel/linux/igb_uio/igb_uio.ko"; then
			igb_uio_fallback="$rootdir/dpdk/build-tmp/kernel/linux/igb_uio/igb_uio.ko"
		fi
	fi

	if [[ "${DRIVER_OVERRIDE}" == "none" ]]; then
		driver_name=none
	elif [[ -n "${DRIVER_OVERRIDE}" ]]; then
		driver_path="$DRIVER_OVERRIDE"
		driver_name="${DRIVER_OVERRIDE##*/}"
		# modprobe and the sysfs don't use the .ko suffix.
		driver_name=${driver_name%.ko}
		# path = name -> there is no path
		if [[ "$driver_path" = "$driver_name" ]]; then
			driver_path=""
		fi
	elif [[ -n "$(ls /sys/kernel/iommu_groups)" || (-e \
	/sys/module/vfio/parameters/enable_unsafe_noiommu_mode && \
	"$(cat /sys/module/vfio/parameters/enable_unsafe_noiommu_mode)" == "Y") ]]; then
		driver_name=vfio-pci
		# Just in case, attempt to load VFIO_IOMMU_TYPE1 module into the kernel - this
		# should be done automatically by modprobe since this particular module should
		# be a part of vfio-pci dependencies, however, on some distros, it seems that
		# it's not the case. See #1689.
		if modinfo vfio_iommu_type1 > /dev/null; then
			modprobe vfio_iommu_type1
		fi
	elif ! check_for_driver uio_pci_generic || modinfo uio_pci_generic > /dev/null 2>&1; then
		driver_name=uio_pci_generic
	elif [[ -e $igb_uio_fallback ]]; then
		driver_path="$igb_uio_fallback"
		driver_name="igb_uio"
		echo "WARNING: uio_pci_generic not detected - using $driver_name"
	else
		echo "No valid drivers found [vfio-pci, uio_pci_generic, igb_uio]. Please enable one of the kernel modules."
		return 1
	fi

	# modprobe searches the kernel's module directory; if the user passed in a
	# path, use insmod instead.
	if [[ $driver_name != "none" ]]; then
		if [[ -n "$driver_path" ]]; then
			insmod $driver_path || true
		else
			modprobe $driver_name
		fi
	fi

	for bdf in "${!all_devices_d[@]}"; do
		if ((all_devices_d["$bdf"] == 0)); then
			if [[ -n ${nvme_d["$bdf"]} ]]; then
				# Some nvme controllers may take a significant amount of time while being
				# unbound from the driver. Put that task into the background to speed up the
				# whole process. Currently this is done only for devices bound to the
				# nvme driver, as others (e.g. ioatdma) trigger a kernel BUG when being
				# unbound in parallel. See https://bugzilla.kernel.org/show_bug.cgi?id=209041.
				linux_bind_driver "$bdf" "$driver_name" &
			else
				linux_bind_driver "$bdf" "$driver_name"
			fi
		fi
	done
	wait

	echo "1" > "/sys/bus/pci/rescan"
}

function cleanup_linux() {
	shopt -s extglob nullglob
	dirs_to_clean=""
	dirs_to_clean="$(echo {/var/run,/tmp}/dpdk/spdk{,_pid}+([0-9])) "
	if [[ -d $XDG_RUNTIME_DIR && $XDG_RUNTIME_DIR != *" "* ]]; then
		dirs_to_clean+="$(readlink -e assert_not_empty $XDG_RUNTIME_DIR/dpdk/spdk{,_pid}+([0-9]) || true) "
	fi

	files_to_clean="" file_locks=()
	for dir in $dirs_to_clean; do
		files_to_clean+="$(echo $dir/*) "
	done
	file_locks+=(/var/tmp/spdk_pci_lock*)
	shopt -u extglob nullglob

	files_to_clean+="$(ls -1 /dev/shm/* \
		| grep -E '(spdk_tgt|iscsi|vhost|nvmf|rocksdb|bdevio|bdevperf|vhost_fuzz|nvme_fuzz)_trace|spdk_iscsi_conns' || true) "
	files_to_clean+=" ${file_locks[*]}"
	files_to_clean="$(readlink -e assert_not_empty $files_to_clean || true)"
	if [[ -z "$files_to_clean" ]]; then
		echo "Clean"
		return 0
	fi

	shopt -s extglob
	for fd_dir in $(echo /proc/+([0-9])); do
		opened_files+="$(readlink -e assert_not_empty $fd_dir/fd/* || true)"
	done
	shopt -u extglob

	if [[ -z "$opened_files" ]]; then
		echo "Can't get list of opened files!"
		exit 1
	fi

	echo 'Cleaning'
	for f in $files_to_clean; do
		if ! echo "$opened_files" | grep -E -q "^$f\$"; then
			echo "Removing:    $f"
			rm $f
		else
			echo "Still open: $f"
		fi
	done

	for dir in $dirs_to_clean; do
		if ! echo "$opened_files" | grep -E -q "^$dir\$"; then
			echo "Removing:    $dir"
			rmdir $dir
		else
			echo "Still open: $dir"
		fi
	done
	echo "Clean"

	unset dirs_to_clean files_to_clean opened_files
}

check_hugepages_alloc() {
	local hp_int=$1
	local allocated_hugepages

	echo $((NRHUGE < 0 ? 0 : NRHUGE)) > "$hp_int"

	allocated_hugepages=$(< "$hp_int")
	if ((allocated_hugepages < NRHUGE)); then
		cat <<- ERROR

			## ERROR: requested $NRHUGE hugepages but only $allocated_hugepages could be allocated ${2:+on node$2}.
			## Memory might be heavily fragmented. Please try flushing the system cache, or reboot the machine.
		ERROR
		return 1
	fi
}

clear_hugepages() { echo 0 > /proc/sys/vm/nr_hugepages; }

configure_linux_hugepages() {
	local node system_nodes
	local nodes_to_use nodes_hp

	if [[ $CLEAR_HUGE == yes ]]; then
		clear_hugepages
	fi

	if [[ $HUGE_EVEN_ALLOC == yes ]]; then
		clear_hugepages
		check_hugepages_alloc /proc/sys/vm/nr_hugepages
		return 0
	fi

	for node in /sys/devices/system/node/node*; do
		[[ -e $node ]] || continue
		nodes[${node##*node}]=$node/hugepages/hugepages-${HUGEPGSZ}kB/nr_hugepages
	done

	if ((${#nodes[@]} == 0)); then
		# No NUMA support? Fall back to the common interface
		check_hugepages_alloc /proc/sys/vm/nr_hugepages
		return 0
	fi

	IFS="," read -ra nodes_to_use <<< "$HUGENODE"
	if ((${#nodes_to_use[@]} == 0)); then
		nodes_to_use[0]=0
	fi

	# Align indexes with node ids
	for node in "${!nodes_to_use[@]}"; do
		if [[ ${nodes_to_use[node]} =~ ^nodes_hp\[[0-9]+\]= ]]; then
			eval "${nodes_to_use[node]}"
		elif [[ ${nodes_to_use[node]} =~ ^[0-9]+$ ]]; then
			nodes_hp[nodes_to_use[node]]=$NRHUGE
		fi
	done

	for node in "${!nodes_hp[@]}"; do
		if [[ -z ${nodes[node]} ]]; then
			echo "Node $node doesn't exist, ignoring" >&2
			continue
		fi
		NRHUGE=${nodes_hp[node]:-$NRHUGE} check_hugepages_alloc "${nodes[node]}" "$node"
	done
}
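
# Worked examples of the HUGENODE handling above (values are illustrative):
#   HUGENODE='0,1' NRHUGE=512     -> 512 pages on node0 and 512 on node1
#   HUGENODE='nodes_hp[1]=2048,0' -> 2048 pages on node1, default NRHUGE on node0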

function configure_linux() {
	configure_linux_pci
	hugetlbfs_mounts=$(linux_hugetlbfs_mounts)

	if [ -z "$hugetlbfs_mounts" ]; then
		hugetlbfs_mounts=/mnt/huge
		echo "Mounting hugetlbfs at $hugetlbfs_mounts"
		mkdir -p "$hugetlbfs_mounts"
		mount -t hugetlbfs nodev "$hugetlbfs_mounts"
	fi

	configure_linux_hugepages

	if [ "$driver_name" = "vfio-pci" ]; then
		if [ -n "$TARGET_USER" ]; then
			for mount in $hugetlbfs_mounts; do
				chown "$TARGET_USER" "$mount"
				chmod g+w "$mount"
			done

			MEMLOCK_AMNT=$(su "$TARGET_USER" -c "ulimit -l")
			if [[ $MEMLOCK_AMNT != "unlimited" ]]; then
				MEMLOCK_MB=$((MEMLOCK_AMNT / 1024))
				cat <<- MEMLOCK
					"$TARGET_USER" user memlock limit: $MEMLOCK_MB MB

					This is the maximum amount of memory you will be
					able to use with DPDK and VFIO if run as user "$TARGET_USER".
					To change this, please adjust limits.conf memlock limit for user "$TARGET_USER".
				MEMLOCK
				if ((MEMLOCK_AMNT < 65536)); then
					echo ""
					echo "## WARNING: memlock limit is less than 64MB"
					echo -n "## DPDK with VFIO may not be able to initialize "
					echo "if run as user \"$TARGET_USER\"."
				fi
			fi
		fi
	fi

	if [ ! -e /dev/cpu/0/msr ]; then
		# Some distros build msr as a module.  Make sure it's loaded to ensure
		#  DPDK can easily figure out the TSC rate rather than relying on 100ms
		#  sleeps.
		modprobe msr || true
	fi
}

function reset_linux_pci() {
	# virtio
	# TODO: check if virtio-pci is loaded first and just unbind if it is not loaded
	# Requires some more investigation - for example, some kernels do not seem to have
	#  virtio-pci but just virtio_scsi instead.  Also need to make sure we get the
	#  underscore vs. dash right in the virtio_scsi name.
	modprobe virtio-pci || true
	for bdf in "${!all_devices_d[@]}"; do
		((all_devices_d["$bdf"] == 0)) || continue

		driver=$(collect_driver "$bdf")
		if [[ -n $driver ]] && ! check_for_driver "$driver"; then
			linux_bind_driver "$bdf" "$driver"
		else
			linux_unbind_driver "$bdf"
		fi
	done

	echo "1" > "/sys/bus/pci/rescan"
}

function reset_linux() {
	reset_linux_pci
	for mount in $(linux_hugetlbfs_mounts); do
		for hp in "$mount"/spdk*map_*; do
			flock -n "$hp" true && rm -f "$hp"
		done
	done
	rm -f /run/.spdk*
}

function status_linux() {
	echo "Hugepages" >&2
	printf "%-6s %10s %8s / %6s\n" "node" "hugesize" "free" "total" >&2

	numa_nodes=0
	shopt -s nullglob
	for path in /sys/devices/system/node/node*/hugepages/hugepages-*/; do
		numa_nodes=$((numa_nodes + 1))
		free_pages=$(cat $path/free_hugepages)
		all_pages=$(cat $path/nr_hugepages)

		[[ $path =~ (node[0-9]+)/hugepages/hugepages-([0-9]+kB) ]]

		node=${BASH_REMATCH[1]}
		huge_size=${BASH_REMATCH[2]}

		printf "%-6s %10s %8s / %6s\n" $node $huge_size $free_pages $all_pages
	done
	shopt -u nullglob

	# fall back to system-wide hugepages
	if [ "$numa_nodes" = "0" ]; then
		free_pages=$(grep HugePages_Free /proc/meminfo | awk '{ print $2 }')
		all_pages=$(grep HugePages_Total /proc/meminfo | awk '{ print $2 }')
		node="-"
		huge_size="$HUGEPGSZ"

		printf "%-6s %10s %8s / %6s\n" $node $huge_size $free_pages $all_pages
	fi

	printf '\n%-8s %-15s %-6s %-6s %-7s %-16s %-10s %s\n' \
		"Type" "BDF" "Vendor" "Device" "NUMA" "Driver" "Device" "Block devices" >&2

	sorted_bdfs=($(printf '%s\n' "${!all_devices_d[@]}" | sort))

	for bdf in "${sorted_bdfs[@]}"; do
		driver=${drivers_d["$bdf"]}
		if [ "$numa_nodes" = "0" ]; then
			node="-"
		else
			node=$(cat /sys/bus/pci/devices/$bdf/numa_node)
			if ((node == -1)); then
				node=unknown
			fi
		fi
		if [ "$driver" = "nvme" ] && [ -d /sys/bus/pci/devices/$bdf/nvme ]; then
			name=$(ls /sys/bus/pci/devices/$bdf/nvme)
		else
			name="-"
		fi

		if [[ -n ${nvme_d["$bdf"]} || -n ${virtio_d["$bdf"]} ]]; then
			blknames=($(get_block_dev_from_bdf "$bdf"))
		else
			blknames=("-")
		fi

		desc=""
		desc=${desc:-${nvme_d["$bdf"]:+NVMe}}
		desc=${desc:-${ioat_d["$bdf"]:+I/OAT}}
		desc=${desc:-${idxd_d["$bdf"]:+IDXD}}
		desc=${desc:-${virtio_d["$bdf"]:+virtio}}
		desc=${desc:-${vmd_d["$bdf"]:+VMD}}

		printf '%-8s %-15s %-6s %-6s %-7s %-16s %-10s %s\n' \
			"$desc" "$bdf" "${pci_ids_vendor["$bdf"]#0x}" "${pci_ids_device["$bdf"]#0x}" \
			"$node" "${driver:--}" "${name:-}" "${blknames[*]:--}"
	done
}

function status_freebsd() {
	local pci

	status_print() (
		local dev driver

		echo -e "BDF\t\tVendor\tDevice\tDriver"

		for pci; do
			driver=$(pciconf -l "pci$pci")
			driver=${driver%@*}
			printf '%s\t%s\t%s\t%s\n' \
				"$pci" \
				"${pci_ids_vendor["$pci"]}" \
				"${pci_ids_device["$pci"]}" \
				"$driver"
		done | sort -k1,1
	)

	local contigmem=present
	local contigmem_buffer_size
	local contigmem_num_buffers

	if ! kldstat -q -m contigmem; then
		contigmem="not present"
	fi
	if ! contigmem_buffer_size=$(kenv hw.contigmem.buffer_size 2> /dev/null); then
		contigmem_buffer_size="not set"
	fi
	if ! contigmem_num_buffers=$(kenv hw.contigmem.num_buffers 2> /dev/null); then
		contigmem_num_buffers="not set"
	fi

	cat <<- BSD_INFO
		Contigmem ($contigmem)
		Buffer Size: $contigmem_buffer_size
		Num Buffers: $contigmem_num_buffers

		NVMe devices
		$(status_print "${!nvme_d[@]}")

		I/OAT DMA
		$(status_print "${!ioat_d[@]}")

		IDXD DMA
		$(status_print "${!idxd_d[@]}")

		VMD
		$(status_print "${!vmd_d[@]}")
	BSD_INFO
}

function configure_freebsd_pci() {
	local BDFS

	BDFS+=("${!nvme_d[@]}")
	BDFS+=("${!ioat_d[@]}")
	BDFS+=("${!idxd_d[@]}")
	BDFS+=("${!vmd_d[@]}")

	# Drop the domain part from all the addresses
	BDFS=("${BDFS[@]#*:}")

	local IFS=","
	kldunload nic_uio.ko || true
	kenv hw.nic_uio.bdfs="${BDFS[*]}"
	kldload nic_uio.ko
}
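
# For example (hypothetical addresses), BDFS=("0000:01:00.0" "0000:02:00.0")
# becomes hw.nic_uio.bdfs="01:00.0,02:00.0" once the domain part is dropped
# and the entries are comma-joined.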

function configure_freebsd() {
	if ! check_for_driver_freebsd; then
		echo "DPDK drivers (contigmem and/or nic_uio) are missing, aborting" >&2
		return 1
	fi
	configure_freebsd_pci
	# If contigmem is already loaded but the HUGEMEM specified doesn't match the
	#  previous value, unload contigmem so that we can reload with the new value.
	if kldstat -q -m contigmem; then
		# contigmem may be loaded, but the kernel environment doesn't necessarily
		# have to be set at this point. If it isn't, kenv will fail to pick up
		# the hw. options. Handle it.
		if ! contigmem_num_buffers=$(kenv hw.contigmem.num_buffers); then
			contigmem_num_buffers=-1
		fi 2> /dev/null
		if ((contigmem_num_buffers != HUGEMEM / 256)); then
			kldunload contigmem.ko
		fi
	fi
	if ! kldstat -q -m contigmem; then
		kenv hw.contigmem.num_buffers=$((HUGEMEM / 256))
		kenv hw.contigmem.buffer_size=$((256 * 1024 * 1024))
		kldload contigmem.ko
	fi
}
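
# Each contigmem buffer is 256 MB, so e.g. the default HUGEMEM=2048 yields
# 2048 / 256 = 8 buffers.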

function reset_freebsd() {
	kldunload contigmem.ko || true
	kldunload nic_uio.ko || true
}

CMD=reset cache_pci_bus

mode=$1

if [ -z "$mode" ]; then
	mode="config"
fi

: ${HUGEMEM:=2048}
: ${PCI_ALLOWED:=""}
: ${PCI_BLOCKED:=""}

if [ -n "$NVME_ALLOWED" ]; then
	PCI_ALLOWED="$PCI_ALLOWED $NVME_ALLOWED"
fi

if [ -n "$SKIP_PCI" ]; then
	PCI_ALLOWED="none"
fi

if [ -z "$TARGET_USER" ]; then
	TARGET_USER="$SUDO_USER"
	if [ -z "$TARGET_USER" ]; then
		TARGET_USER=$(logname 2> /dev/null) || true
	fi
fi

collect_devices "$mode"

if [[ $mode == reset && $PCI_BLOCK_SYNC_ON_RESET == yes ]]; then
	# Note that this will wait only for the first block device attached to
	# a given storage controller. For nvme this may miss some of the devs
	# in case multiple namespaces are in place.
	# FIXME: Wait for nvme controller(s) to be in live state, determine the
	# number of configured namespaces, build a list of potential block devs
	# and pass them to sync_dev_uevents. Is it worth the effort?
	bdfs_to_wait_for=()
	for bdf in "${!all_devices_d[@]}"; do
		((all_devices_d["$bdf"] == 0)) || continue
		if [[ -n ${nvme_d["$bdf"]} || -n ${virtio_d["$bdf"]} ]]; then
			[[ $(collect_driver "$bdf") != "${drivers_d["$bdf"]}" ]] || continue
			bdfs_to_wait_for+=("$bdf")
		fi
	done
	if ((${#bdfs_to_wait_for[@]} > 0)); then
		echo "Waiting for block devices as requested"
		export UEVENT_TIMEOUT=5 DEVPATH_LOOKUP=yes DEVPATH_SUBSYSTEM=pci
		"$rootdir/scripts/sync_dev_uevents.sh" \
			block/disk \
			"${bdfs_to_wait_for[@]}" &
		sync_pid=$!
	fi
fi

if [[ $os == Linux ]]; then
	if [[ -n $HUGEPGSZ && ! -e /sys/kernel/mm/hugepages/hugepages-${HUGEPGSZ}kB ]]; then
		echo "${HUGEPGSZ}kB is not supported by the running kernel, ignoring" >&2
		unset -v HUGEPGSZ
	fi

	HUGEPGSZ=${HUGEPGSZ:-$(grep Hugepagesize /proc/meminfo | cut -d : -f 2 | tr -dc '0-9')}
	HUGEPGSZ_MB=$((HUGEPGSZ / 1024))
	: ${NRHUGE=$(((HUGEMEM + HUGEPGSZ_MB - 1) / HUGEPGSZ_MB))}
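	# Worked example of the rounding above (assuming the defaults): with
	# HUGEMEM=2048 (MB) and 2048 kB hugepages, HUGEPGSZ_MB=2 and
	# NRHUGE=(2048 + 2 - 1) / 2 = 1024 pages.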

	if [ "$mode" == "config" ]; then
		configure_linux
	elif [ "$mode" == "cleanup" ]; then
		cleanup_linux
		clear_hugepages
	elif [ "$mode" == "reset" ]; then
		reset_linux
	elif [ "$mode" == "status" ]; then
		status_linux
	elif [ "$mode" == "help" ]; then
		usage $0
	else
		usage $0 "Invalid argument '$mode'"
	fi
else
	if [ "$mode" == "config" ]; then
		configure_freebsd
	elif [ "$mode" == "reset" ]; then
		reset_freebsd
	elif [ "$mode" == "cleanup" ]; then
		echo "setup.sh cleanup function not yet supported on $os"
	elif [ "$mode" == "status" ]; then
		status_freebsd
	elif [ "$mode" == "help" ]; then
		usage $0
	else
		usage $0 "Invalid argument '$mode'"
	fi
fi

if [[ -e /proc/$sync_pid/status ]]; then
	wait "$sync_pid"
fi