xref: /spdk/scripts/setup.sh (revision ceea3088870a3919d6bdfe61d7adba11b9733fb7)
#!/usr/bin/env bash

set -e

# Everything below relies on Linux sysfs or FreeBSD kld tooling - bail out
# early on any other platform.
os=$(uname -s)

if [[ $os != Linux && $os != FreeBSD ]]; then
	echo "Not supported platform ($os), aborting"
	exit 1
fi

# Resolve the repository root relative to this script. Quote the inner
# substitutions so paths containing whitespace do not word-split.
rootdir=$(readlink -f "$(dirname "$0")")/..
source "$rootdir/scripts/common.sh"
14
# Print the help text and exit 0.
# $1 - name of the invoking script (shown in the usage line)
# $2 - optional message (e.g. an error) printed before the help text
function usage() {
	# "cleanup" and "status" are only implemented on Linux.
	if [[ $os == Linux ]]; then
		options="[config|reset|status|cleanup|help]"
	else
		options="[config|reset|help]"
	fi

	# Print the optional message (if given) first, followed by a blank line.
	[[ -n $2 ]] && (
		echo "$2"
		echo ""
	)
	echo "Helper script for allocating hugepages and binding NVMe, I/OAT, VMD and Virtio devices"
	echo "to a generic VFIO kernel driver. If VFIO is not available on the system, this script"
	echo "will fall back to UIO. NVMe and Virtio devices with active mountpoints will be ignored."
	echo "All hugepage operations use default hugepage size on the system (hugepagesz)."
	echo "Usage: $(basename $1) $options"
	echo
	echo "$options - as following:"
	echo "config            Default mode. Allocate hugepages and bind PCI devices."
	if [[ $os == Linux ]]; then
		echo "cleanup           Remove any orphaned files that can be left in the system after SPDK application exit"
	fi
	echo "reset             Rebind PCI devices back to their original drivers."
	echo "                  Also cleanup any leftover spdk files/resources."
	echo "                  Hugepage memory size will remain unchanged."
	if [[ $os == Linux ]]; then
		echo "status            Print status of all SPDK-compatible devices on the system."
	fi
	echo "help              Print this help message."
	echo
	echo "The following environment variables can be specified."
	echo "HUGEMEM           Size of hugepage memory to allocate (in MB). 2048 by default."
	echo "                  For NUMA systems, the hugepages will be evenly distributed"
	echo "                  between CPU nodes"
	echo "NRHUGE            Number of hugepages to allocate. This variable overwrites HUGEMEM."
	echo "HUGENODE          Specific NUMA node to allocate hugepages on. To allocate"
	echo "                  hugepages on multiple nodes run this script multiple times -"
	echo "                  once for each node."
	echo "PCI_WHITELIST"
	echo "PCI_BLACKLIST     Whitespace separated list of PCI devices (NVMe, I/OAT, VMD, Virtio)."
	echo "                  Each device must be specified as a full PCI address."
	echo "                  E.g. PCI_WHITELIST=\"0000:01:00.0 0000:02:00.0\""
	echo "                  To blacklist all PCI devices use a non-valid address."
	echo "                  E.g. PCI_WHITELIST=\"none\""
	echo "                  If PCI_WHITELIST and PCI_BLACKLIST are empty or unset, all PCI devices"
	echo "                  will be bound."
	echo "                  Each device in PCI_BLACKLIST will be ignored (driver won't be changed)."
	echo "                  PCI_BLACKLIST has precedence over PCI_WHITELIST."
	echo "TARGET_USER       User that will own hugepage mountpoint directory and vfio groups."
	echo "                  By default the current user will be used."
	echo "DRIVER_OVERRIDE   Disable automatic vfio-pci/uio_pci_generic selection and forcefully"
	echo "                  bind devices to the given driver."
	echo "                  E.g. DRIVER_OVERRIDE=uio_pci_generic or DRIVER_OVERRIDE=/home/public/dpdk/build/kmod/igb_uio.ko"
	echo "PCI_BLOCK_SYNC_ON_RESET"
	echo "                  If set in the environment, the attempt to wait for block devices associated"
	echo "                  with given PCI device will be made upon reset"
	# NOTE: exits 0 even when called with an "Invalid argument" message.
	exit 0
}
73
# In monolithic kernels the lsmod won't work. So
# back that with a /sys/modules. We also check
# /sys/bus/pci/drivers/ as neither lsmod nor /sys/modules might
# contain needed info (like in Fedora-like OS).
# $1 - driver name (dashes and underscores are treated as equivalent).
# Returns 1 if the module shows up in lsmod, 2 if it is visible under
# /sys, 0 when it appears to be absent. Callers use "if ! check_for_driver"
# to mean "driver is present".
function check_for_driver() {
	# Module names use '_' where driver names may use '-'; normalize.
	# Quote the pattern so it reaches grep as a single, unglobbed argument.
	if lsmod | grep -q "${1//-/_}"; then
		return 1
	fi

	if [[ -d /sys/module/${1} || -d \
		/sys/module/${1//-/_} || -d \
		/sys/bus/pci/drivers/${1} || -d \
		/sys/bus/pci/drivers/${1//-/_} ]]; then
		return 2
	fi
	return 0
}
91
# Echo a message prefixed with the device's PCI address and its vendor and
# device IDs (hex, leading "0x" stripped), e.g.
# "0000:01:00.0 (8086 0953): <message>".
function pci_dev_echo() {
	local addr=$1
	shift
	local vendor=${pci_ids_vendor["$addr"]#0x}
	local device=${pci_ids_device["$addr"]#0x}
	echo "$addr ($vendor $device): $*"
}
97
# Bind a PCI device to the requested kernel driver via sysfs.
# $1 - PCI address (bdf), $2 - target driver name.
# NOTE: bdf/driver_name/old_driver_name/ven_dev_id are deliberately NOT
# local - the igb_uio fallback below relies on mutating them for the
# remaining devices (see the inline comment).
function linux_bind_driver() {
	bdf="$1"
	driver_name="$2"
	old_driver_name=${drivers_d["$bdf"]:-no driver}
	ven_dev_id="${pci_ids_vendor["$bdf"]#0x} ${pci_ids_device["$bdf"]#0x}"

	if [[ $driver_name == "$old_driver_name" ]]; then
		pci_dev_echo "$bdf" "Already using the $old_driver_name driver"
		return 0
	fi

	# Detach from the current driver first. remove_id may legitimately fail
	# if the id was never added dynamically, hence the "|| true".
	if [[ $old_driver_name != "no driver" ]]; then
		echo "$ven_dev_id" > "/sys/bus/pci/devices/$bdf/driver/remove_id" 2> /dev/null || true
		echo "$bdf" > "/sys/bus/pci/devices/$bdf/driver/unbind"
	fi

	pci_dev_echo "$bdf" "$old_driver_name -> $driver_name"

	# new_id may auto-bind the device, in which case the explicit bind
	# write fails - both writes are therefore best-effort.
	echo "$ven_dev_id" > "/sys/bus/pci/drivers/$driver_name/new_id" 2> /dev/null || true
	echo "$bdf" > "/sys/bus/pci/drivers/$driver_name/bind" 2> /dev/null || true

	if [[ $driver_name == uio_pci_generic && -e /sys/module/igb_uio ]]; then
		# Check if the uio_pci_generic driver is broken as it might be in
		# some 4.18.x kernels (see centos8 for instance) - if our device
		# didn't get a proper uio entry, fallback to igb_uio
		if [[ ! -e /sys/bus/pci/devices/$bdf/uio ]]; then
			pci_dev_echo "$bdf" "uio_pci_generic potentially broken, moving to igb_uio"
			drivers_d["$bdf"]="no driver"
			# This call will override $driver_name for remaining devices as well
			linux_bind_driver "$bdf" igb_uio
		fi
	fi

	# If the device landed in a vfio group, hand the group device node over
	# to TARGET_USER so an unprivileged process can use it.
	iommu_group=$(basename $(readlink -f /sys/bus/pci/devices/$bdf/iommu_group))
	if [ -e "/dev/vfio/$iommu_group" ]; then
		if [ -n "$TARGET_USER" ]; then
			chown "$TARGET_USER" "/dev/vfio/$iommu_group"
		fi
	fi
}
138
# Unbind a PCI device from its current driver, leaving it driverless.
# $1 - PCI address (bdf). The previous driver is looked up in drivers_d.
function linux_unbind_driver() {
	local bdf="$1"
	local ven_dev_id
	ven_dev_id="${pci_ids_vendor["$bdf"]#0x} ${pci_ids_device["$bdf"]#0x}"
	local old_driver_name=${drivers_d["$bdf"]:-no driver}

	# Only touch sysfs if that driver is actually registered; remove_id is
	# best-effort since the id may never have been added dynamically.
	if [[ -e /sys/bus/pci/drivers/$old_driver_name ]]; then
		echo "$ven_dev_id" > "/sys/bus/pci/drivers/$old_driver_name/remove_id" 2> /dev/null || true
		echo "$bdf" > "/sys/bus/pci/drivers/$old_driver_name/unbind"
	fi

	pci_dev_echo "$bdf" "$old_driver_name -> no driver"
}
152
# Print the mountpoint of every mounted hugetlbfs filesystem, one per line.
# mount(8) output has the form "<dev> on <dir> type <fstype> (<opts>)",
# so the mountpoint is field 3 and the fstype is field 5.
function linux_hugetlbfs_mounts() {
	mount | awk '$4 == "type" && $5 == "hugetlbfs" { print $3 }'
}
156
# Print the name (e.g. "nvme0n1") of the first block device whose sysfs
# device path contains the given PCI address, if any.
# $1 - PCI address (bdf)
function get_block_dev_from_bdf() {
	local pci_addr=$1
	local sysdir link

	for sysdir in /sys/block/*; do
		link=$(readlink -f "$sysdir/device")
		case $link in
			*"/$pci_addr/"*)
				echo "${sysdir##*/}"
				return 0
				;;
		esac
	done
	# No block device found for this address.
	return 1
}
168
# Print the names of all currently-mounted partitions (e.g. "nvme0n1p1")
# that live on block devices attached to the given PCI address.
# $1 - PCI address (bdf)
function get_mounted_part_dev_from_bdf_block() {
	local bdf=$1
	local blocks block part

	blocks=($(get_block_dev_from_bdf "$bdf"))

	for block in "${blocks[@]}"; do
		# Partition subdirs are named after the parent, e.g.
		# /sys/block/nvme0n1/nvme0n1p1.
		for part in "/sys/block/$block/$block"*; do
			[[ -b /dev/${part##*/} ]] || continue
			# /proc/self/mountinfo lists each mount's device as a
			# "major:minor" field, which is exactly what $part/dev
			# contains - a whole-word match means the partition is mounted.
			if [[ $(< /proc/self/mountinfo) == *" $(< "$part/dev") "* ]]; then
				echo "${part##*/}"
			fi
		done
	done
}
184
# Discover all SPDK-compatible PCI devices and record them in global
# associative arrays:
#   <type>_d[bdf]      - 1 if the device must be skipped, 0 otherwise
#                        (type is one of nvme/ioat/idxd/virtio/vmd)
#   all_devices_d[bdf] - same flag, across every type
#   drivers_d[bdf]     - currently bound kernel driver, if any
# $1 - current mode; in "status" mode no skip-filtering is applied.
function collect_devices() {
	# NVMe, IOAT, IDXD, VIRTIO, VMD

	local ids dev_type dev_id bdf bdfs in_use driver

	# Regex matching the relevant ID macros in include/spdk/pci_ids.h.
	ids+="PCI_DEVICE_ID_INTEL_IOAT"
	ids+="|PCI_DEVICE_ID_INTEL_IDXD"
	ids+="|PCI_DEVICE_ID_VIRTIO"
	ids+="|PCI_DEVICE_ID_INTEL_VMD"
	ids+="|SPDK_PCI_CLASS_NVME"

	local -gA nvme_d ioat_d idxd_d virtio_d vmd_d all_devices_d drivers_d

	# Each matched line is presumably "#define <MACRO> <id>" - the leading
	# "#define" lands in the discarded "_" field (verify against pci_ids.h).
	while read -r _ dev_type dev_id; do
		# Intel vendor ID by default; NVMe is keyed by class code alone and
		# virtio by the Red Hat vendor ID.
		bdfs=(${pci_bus_cache["0x8086:$dev_id"]})
		[[ $dev_type == *NVME* ]] && bdfs=(${pci_bus_cache["$dev_id"]})
		[[ $dev_type == *VIRT* ]] && bdfs=(${pci_bus_cache["0x1af4:$dev_id"]})
		# Map the macro name to a lowercase type tag (nvme, ioat, ...).
		[[ $dev_type =~ (NVME|IOAT|IDXD|VIRTIO|VMD) ]] && dev_type=${BASH_REMATCH[1],,}
		for bdf in "${bdfs[@]}"; do
			in_use=0
			if [[ $1 != status ]]; then
				if ! pci_can_use "$bdf"; then
					pci_dev_echo "$bdf" "Skipping denied controller at $bdf"
					in_use=1
				fi
				# NVMe/virtio devices with mounted filesystems are skipped.
				if [[ $dev_type == nvme || $dev_type == virtio ]]; then
					if ! verify_bdf_mounts "$bdf"; then
						in_use=1
					fi
				fi
				# VMD controllers must be explicitly whitelisted.
				if [[ $dev_type == vmd ]]; then
					if [[ $PCI_WHITELIST != *"$bdf"* ]]; then
						pci_dev_echo "$bdf" "Skipping not allowed VMD controller at $bdf"
						in_use=1
					fi
				fi
			fi
			# e.g. nvme_d["0000:01:00.0"]=0
			eval "${dev_type}_d[$bdf]=$in_use"
			all_devices_d["$bdf"]=$in_use
			if [[ -e /sys/bus/pci/devices/$bdf/driver ]]; then
				driver=$(readlink -f "/sys/bus/pci/devices/$bdf/driver")
				drivers_d["$bdf"]=${driver##*/}
			fi
		done
	done < <(grep -E "$ids" "$rootdir/include/spdk/pci_ids.h")
}
231
# Resolve the kernel driver that would normally claim the given device and
# print its name on stdout.
# $1 - PCI address (bdf)
# $2 - driver name to fall back to when the modalias lookup fails
# Returns 1 if the device has no modalias entry.
function collect_driver() {
	local bdf=$1
	local override_driver=$2
	local drivers driver

	[[ -e /sys/bus/pci/devices/$bdf/modalias ]] || return 1
	# "modprobe -R" resolves the device's modalias string to the module
	# name(s) that match it.
	if drivers=($(modprobe -R "$(< "/sys/bus/pci/devices/$bdf/modalias")")); then
		# Pick first entry in case multiple aliases are bound to a driver.
		driver=$(readlink -f "/sys/module/${drivers[0]}/drivers/pci:"*)
		driver=${driver##*/}
	else
		driver=$override_driver
	fi 2> /dev/null
	echo "$driver"
}
247
# Check whether any partition backed by the given PCI device is currently
# mounted, printing a notice for each one found.
# $1 - PCI address (bdf)
# Returns 1 if active mountpoints exist (device must not be rebound),
# 0 otherwise.
function verify_bdf_mounts() {
	local bdf=$1
	# "name" was previously leaked as a global - keep it function-local.
	local name
	local blknames=($(get_mounted_part_dev_from_bdf_block "$bdf"))

	if ((${#blknames[@]} > 0)); then
		for name in "${blknames[@]}"; do
			pci_dev_echo "$bdf" "Active mountpoints on /dev/$name, so not binding PCI dev"
		done
		return 1
	fi
}
259
# Select a userspace PCI driver (DRIVER_OVERRIDE, else vfio-pci, else
# uio_pci_generic, else the bundled igb_uio - in that order), load it and
# bind every eligible collected device to it.
function configure_linux_pci() {
	local driver_path=""
	# driver_name is deliberately global - configure_linux reads it later
	# to decide whether vfio-specific permissions must be set up.
	driver_name=""
	igb_uio_fallback=""

	# igb_uio is a common driver to override with and it depends on uio.
	modprobe uio
	if [[ -r "$rootdir/dpdk/build-tmp/kernel/linux/igb_uio/igb_uio.ko" ]]; then
		igb_uio_fallback=$rootdir/dpdk/build-tmp/kernel/linux/igb_uio/igb_uio.ko
		insmod "$igb_uio_fallback" || true
	fi

	if [[ -n "${DRIVER_OVERRIDE}" ]]; then
		driver_path="$DRIVER_OVERRIDE"
		driver_name="${DRIVER_OVERRIDE##*/}"
		# modprobe and the sysfs don't use the .ko suffix.
		driver_name=${driver_name%.ko}
		# path = name -> there is no path
		if [[ "$driver_path" = "$driver_name" ]]; then
			driver_path=""
		fi
	elif [[ -n "$(ls /sys/kernel/iommu_groups)" || (-e \
	/sys/module/vfio/parameters/enable_unsafe_noiommu_mode && \
	"$(cat /sys/module/vfio/parameters/enable_unsafe_noiommu_mode)" == "Y") ]]; then
		# An IOMMU (or vfio no-IOMMU mode) is available - prefer vfio-pci.
		driver_name=vfio-pci
	elif modinfo uio_pci_generic > /dev/null 2>&1; then
		driver_name=uio_pci_generic
	elif [[ -e $igb_uio_fallback ]]; then
		driver_name="igb_uio"
		echo "WARNING: uio_pci_generic not detected - using $driver_name"
	else
		echo "No valid drivers found [vfio-pci, uio_pci_generic, igb_uio]. Please either enable the vfio-pci or uio_pci_generic"
		echo "kernel modules, or have SPDK build the igb_uio driver by running ./configure --with-igb-uio-driver and recompiling."
		return 1
	fi

	# modprobe assumes the directory of the module. If the user passes in a path, we should use insmod
	if [[ -n "$driver_path" ]]; then
		insmod $driver_path || true
	else
		modprobe $driver_name
	fi

	for bdf in "${!all_devices_d[@]}"; do
		if ((all_devices_d["$bdf"] == 0)); then
			if [[ -n ${nvme_d["$bdf"]} ]]; then
				# Some nvme controllers may take significant amount of time while being
				# unbound from the driver. Put that task into background to speed up the
				# whole process. Currently this is done only for the devices bound to the
				# nvme driver as other, i.e., ioatdma's, trigger a kernel BUG when being
				# unbound in parallel. See https://bugzilla.kernel.org/show_bug.cgi?id=209041.
				linux_bind_driver "$bdf" "$driver_name" &
			else
				linux_bind_driver "$bdf" "$driver_name"
			fi
		fi
	done
	# Reap the backgrounded nvme rebinds before rescanning the bus.
	wait

	echo "1" > "/sys/bus/pci/rescan"
}
321
# Remove orphaned SPDK runtime files (shared memory, trace files, dpdk run
# dirs) that are no longer held open by any process.
function cleanup_linux() {
	# extglob enables the +([0-9]) patterns; nullglob drops non-matching
	# globs instead of leaving them literal.
	shopt -s extglob nullglob
	dirs_to_clean=""
	dirs_to_clean="$(echo {/var/run,/tmp}/dpdk/spdk{,_pid}+([0-9])) "
	if [[ -d $XDG_RUNTIME_DIR && $XDG_RUNTIME_DIR != *" "* ]]; then
		# "assert_not_empty" is a deliberately nonexistent path: it makes
		# readlink -e fail (printing nothing extra) when the glob expanded
		# to nothing, while still resolving any real matches.
		dirs_to_clean+="$(readlink -e assert_not_empty $XDG_RUNTIME_DIR/dpdk/spdk{,_pid}+([0-9]) || true) "
	fi

	files_to_clean=""
	for dir in $dirs_to_clean; do
		files_to_clean+="$(echo $dir/*) "
	done
	shopt -u extglob nullglob

	# Pick up SPDK trace/conn files from /dev/shm as well.
	files_to_clean+="$(ls -1 /dev/shm/* \
		| grep -E '(spdk_tgt|iscsi|vhost|nvmf|rocksdb|bdevio|bdevperf|vhost_fuzz|nvme_fuzz)_trace|spdk_iscsi_conns' || true) "
	# Filter the accumulated list down to paths that actually exist.
	files_to_clean="$(readlink -e assert_not_empty $files_to_clean || true)"
	if [[ -z "$files_to_clean" ]]; then
		echo "Clean"
		return 0
	fi

	# Build the list of files currently opened by any process so we never
	# delete something still in use.
	shopt -s extglob
	for fd_dir in $(echo /proc/+([0-9])); do
		opened_files+="$(readlink -e assert_not_empty $fd_dir/fd/* || true)"
	done
	shopt -u extglob

	if [[ -z "$opened_files" ]]; then
		echo "Can't get list of opened files!"
		exit 1
	fi

	echo 'Cleaning'
	for f in $files_to_clean; do
		if ! echo "$opened_files" | grep -E -q "^$f\$"; then
			echo "Removing:    $f"
			rm $f
		else
			echo "Still open: $f"
		fi
	done

	for dir in $dirs_to_clean; do
		if ! echo "$opened_files" | grep -E -q "^$dir\$"; then
			echo "Removing:    $dir"
			rmdir $dir
		else
			echo "Still open: $dir"
		fi
	done
	echo "Clean"

	unset dirs_to_clean files_to_clean opened_files
}
377
# Full Linux "config" mode: bind PCI devices, mount hugetlbfs if needed,
# allocate hugepages and set up permissions for unprivileged vfio usage.
# Env: NRHUGE, HUGENODE, HUGEPGSZ, TARGET_USER (see usage()).
function configure_linux() {
	configure_linux_pci
	hugetlbfs_mounts=$(linux_hugetlbfs_mounts)

	if [ -z "$hugetlbfs_mounts" ]; then
		hugetlbfs_mounts=/mnt/huge
		echo "Mounting hugetlbfs at $hugetlbfs_mounts"
		mkdir -p "$hugetlbfs_mounts"
		mount -t hugetlbfs nodev "$hugetlbfs_mounts"
	fi

	# Allocate system-wide unless a specific NUMA node was requested.
	if [ -z "$HUGENODE" ]; then
		hugepages_target="/proc/sys/vm/nr_hugepages"
	else
		hugepages_target="/sys/devices/system/node/node${HUGENODE}/hugepages/hugepages-${HUGEPGSZ}kB/nr_hugepages"
	fi

	echo "$NRHUGE" > "$hugepages_target"
	# The kernel may allocate fewer pages than requested when memory is
	# fragmented - read the value back and verify.
	allocated_hugepages=$(cat $hugepages_target)
	if [ "$allocated_hugepages" -lt "$NRHUGE" ]; then
		echo ""
		echo "## ERROR: requested $NRHUGE hugepages but only $allocated_hugepages could be allocated."
		echo "## Memory might be heavily fragmented. Please try flushing the system cache, or reboot the machine."
		exit 1
	fi

	if [ "$driver_name" = "vfio-pci" ]; then
		if [ -n "$TARGET_USER" ]; then
			for mount in $hugetlbfs_mounts; do
				chown "$TARGET_USER" "$mount"
				chmod g+w "$mount"
			done

			# Warn if the target user's memlock limit would cap the amount
			# of memory VFIO can pin.
			MEMLOCK_AMNT=$(su "$TARGET_USER" -c "ulimit -l")
			if [[ $MEMLOCK_AMNT != "unlimited" ]]; then
				# NOTE(review): assumes ulimit -l reports kB here - confirm.
				MEMLOCK_MB=$((MEMLOCK_AMNT / 1024))
				cat <<- MEMLOCK
					"$TARGET_USER" user memlock limit: $MEMLOCK_MB MB

					This is the maximum amount of memory you will be
					able to use with DPDK and VFIO if run as user "$TARGET_USER".
					To change this, please adjust limits.conf memlock limit for user "$TARGET_USER".
				MEMLOCK
				if ((MEMLOCK_AMNT < 65536)); then
					echo ""
					echo "## WARNING: memlock limit is less than 64MB"
					echo -n "## DPDK with VFIO may not be able to initialize "
					echo "if run as user \"$TARGET_USER\"."
				fi
			fi
		fi
	fi

	if [ ! -f /dev/cpu/0/msr ]; then
		# Some distros build msr as a module.  Make sure it's loaded to ensure
		#  DPDK can easily figure out the TSC rate rather than relying on 100ms
		#  sleeps.
		modprobe msr || true
	fi
}
438
# Rebind every collected device back to its in-kernel driver, or unbind it
# completely if no suitable driver is available.
function reset_linux_pci() {
	# virtio
	# TODO: check if virtio-pci is loaded first and just unbind if it is not loaded
	# Requires some more investigation - for example, some kernels do not seem to have
	#  virtio-pci but just virtio_scsi instead.  Also need to make sure we get the
	#  underscore vs. dash right in the virtio_scsi name.
	modprobe virtio-pci || true
	for bdf in "${!all_devices_d[@]}"; do
		((all_devices_d["$bdf"] == 0)) || continue

		# Pick the expected in-kernel driver for this device type.
		# NOTE(review): fallback_driver is not reset between iterations;
		# each bdf should belong to exactly one *_d array so the stale
		# value is presumably always overwritten - confirm.
		[[ -n ${nvme_d["$bdf"]} ]] && fallback_driver=nvme
		[[ -n ${ioat_d["$bdf"]} ]] && fallback_driver=ioatdma
		[[ -n ${idxd_d["$bdf"]} ]] && fallback_driver=idxd
		[[ -n ${virtio_d["$bdf"]} ]] && fallback_driver=virtio-pci
		[[ -n ${vmd_d["$bdf"]} ]] && fallback_driver=vmd
		driver=$(collect_driver "$bdf" "$fallback_driver")

		# check_for_driver returns non-zero when the driver IS present.
		if ! check_for_driver "$driver"; then
			linux_bind_driver "$bdf" "$driver"
		else
			linux_unbind_driver "$bdf"
		fi
	done

	echo "1" > "/sys/bus/pci/rescan"
}
465
# Linux "reset" mode: rebind devices to their original drivers and remove
# leftover SPDK hugepage map files. Hugepage allocation is left unchanged.
function reset_linux() {
	reset_linux_pci
	for mount in $(linux_hugetlbfs_mounts); do
		rm -f "$mount"/spdk*map_*
	done
	rm -f /run/.spdk*
}
473
# Print hugepage usage (per NUMA node, or system-wide when no per-node
# info exists) followed by a listing of every collected SPDK-compatible
# device, grouped by type (Linux only).
function status_linux() {
	echo "Hugepages"
	printf "%-6s %10s %8s / %6s\n" "node" "hugesize" "free" "total"

	numa_nodes=0
	shopt -s nullglob
	for path in /sys/devices/system/node/node*/hugepages/hugepages-*/; do
		numa_nodes=$((numa_nodes + 1))
		free_pages=$(cat $path/free_hugepages)
		all_pages=$(cat $path/nr_hugepages)

		# Extract "nodeN" and the page size from the sysfs path.
		[[ $path =~ (node[0-9]+)/hugepages/hugepages-([0-9]+kB) ]]

		node=${BASH_REMATCH[1]}
		huge_size=${BASH_REMATCH[2]}

		printf "%-6s %10s %8s / %6s\n" $node $huge_size $free_pages $all_pages
	done
	shopt -u nullglob

	# fall back to system-wide hugepages
	if [ "$numa_nodes" = "0" ]; then
		free_pages=$(grep HugePages_Free /proc/meminfo | awk '{ print $2 }')
		all_pages=$(grep HugePages_Total /proc/meminfo | awk '{ print $2 }')
		node="-"
		huge_size="$HUGEPGSZ"

		printf "%-6s %10s %8s / %6s\n" $node $huge_size $free_pages $all_pages
	fi

	echo -e "\nBDF\t\tVendor\tDevice\tNUMA\tDriver\t\tDevice name\n"
	echo "NVMe devices"

	for bdf in "${!nvme_d[@]}"; do
		driver=${drivers_d["$bdf"]}
		if [ "$numa_nodes" = "0" ]; then
			node="-"
		else
			node=$(cat /sys/bus/pci/devices/$bdf/numa_node)
			if ((node == -1)); then
				node=unknown
			fi
		fi
		if [ "$driver" = "nvme" ] && [ -d /sys/bus/pci/devices/$bdf/nvme ]; then
			name="\t"$(ls /sys/bus/pci/devices/$bdf/nvme)
		else
			name="-"
		fi
		echo -e "$bdf\t${pci_ids_vendor["$bdf"]#0x}\t${pci_ids_device["$bdf"]#0x}\t$node\t${driver:--}\t\t$name"
	done

	echo ""
	echo "I/OAT Engine"

	for bdf in "${!ioat_d[@]}"; do
		driver=${drivers_d["$bdf"]}
		if [ "$numa_nodes" = "0" ]; then
			node="-"
		else
			node=$(cat /sys/bus/pci/devices/$bdf/numa_node)
			if ((node == -1)); then
				node=unknown
			fi
		fi
		echo -e "$bdf\t${pci_ids_vendor["$bdf"]#0x}\t${pci_ids_device["$bdf"]#0x}\t$node\t${driver:--}"
	done

	echo ""
	echo "IDXD Engine"

	for bdf in "${!idxd_d[@]}"; do
		driver=${drivers_d["$bdf"]}
		if [ "$numa_nodes" = "0" ]; then
			node="-"
		else
			node=$(cat /sys/bus/pci/devices/$bdf/numa_node)
			# Consistency fix: report "unknown" for numa_node == -1 like
			# the other sections do.
			if ((node == -1)); then
				node=unknown
			fi
		fi
		echo -e "$bdf\t${pci_ids_vendor["$bdf"]#0x}\t${pci_ids_device["$bdf"]#0x}\t$node\t${driver:--}"
	done

	echo ""
	echo "virtio"

	for bdf in "${!virtio_d[@]}"; do
		driver=${drivers_d["$bdf"]}
		if [ "$numa_nodes" = "0" ]; then
			node="-"
		else
			node=$(cat /sys/bus/pci/devices/$bdf/numa_node)
			if ((node == -1)); then
				node=unknown
			fi
		fi
		blknames=($(get_mounted_part_dev_from_bdf_block "$bdf"))
		echo -e "$bdf\t${pci_ids_vendor["$bdf"]#0x}\t${pci_ids_device["$bdf"]#0x}\t$node\t\t${driver:--}\t\t" "${blknames[@]}"
	done

	echo ""
	echo "VMD"

	# Consistency fix: guard against non-NUMA systems (numa_node file was
	# read unconditionally before) and print "-" for a missing driver.
	# NOTE(review): this section prints fewer columns than the header
	# advertises (no vendor/device) - kept as-is to preserve the layout.
	for bdf in "${!vmd_d[@]}"; do
		driver=${drivers_d["$bdf"]}
		if [ "$numa_nodes" = "0" ]; then
			node="-"
		else
			node=$(cat /sys/bus/pci/devices/$bdf/numa_node)
			if ((node == -1)); then
				node=unknown
			fi
		fi
		echo -e "$bdf\t$node\t\t${driver:--}"
	done
}
583
# Print contigmem state and per-type device listings on FreeBSD.
function status_freebsd() {
	local pci

	# Subshell helper: prints one table row per bdf argument, using
	# pciconf(8) to look up the currently attached driver.
	status_print() (
		local dev driver

		echo -e "BDF\t\tVendor\tDevice\tDriver"

		for pci; do
			driver=$(pciconf -l "pci$pci")
			# Keep only the part before '@' (the driver+unit name).
			driver=${driver%@*}
			printf '%s\t%s\t%s\t%s\n' \
				"$pci" \
				"${pci_ids_vendor["$pci"]}" \
				"${pci_ids_device["$pci"]}" \
				"$driver"
		done
	)

	local contigmem=present
	if ! kldstat -q -m contigmem; then
		contigmem="not present"
	fi

	cat <<- BSD_INFO
		Contigmem ($contigmem)
		Buffer Size: $(kenv hw.contigmem.buffer_size)
		Num Buffers: $(kenv hw.contigmem.num_buffers)

		NVMe devices
		$(status_print "${!nvme_d[@]}")

		I/IOAT DMA
		$(status_print "${!ioat_d[@]}")

		IDXD DMA
		$(status_print "${!idxd_d[@]}")

		VMD
		$(status_print "${!vmd_d[@]}")
	BSD_INFO
}
626
# Bind all collected NVMe/I-OAT/IDXD/VMD devices to the nic_uio driver by
# passing their addresses through the hw.nic_uio.bdfs kenv tunable.
function configure_freebsd_pci() {
	local BDFS

	BDFS+=("${!nvme_d[@]}")
	BDFS+=("${!ioat_d[@]}")
	BDFS+=("${!idxd_d[@]}")
	BDFS+=("${!vmd_d[@]}")

	# Drop the domain part from all the addresses
	BDFS=("${BDFS[@]#*:}")

	# nic_uio expects a comma-separated list ("${BDFS[*]}" joins on IFS);
	# the module must be reloaded for the new tunable to take effect.
	local IFS=","
	kldunload nic_uio.ko || true
	kenv hw.nic_uio.bdfs="${BDFS[*]}"
	kldload nic_uio.ko
}
643
# FreeBSD "config" mode: bind PCI devices to nic_uio and load contigmem
# with HUGEMEM megabytes of memory split into 256MB buffers.
function configure_freebsd() {
	configure_freebsd_pci
	# If contigmem is already loaded but the HUGEMEM specified doesn't match the
	#  previous value, unload contigmem so that we can reload with the new value.
	if kldstat -q -m contigmem; then
		# Quote the command substitution so an empty kenv result produces a
		# clean test failure rather than a [ syntax error.
		if [ "$(kenv hw.contigmem.num_buffers)" -ne "$((HUGEMEM / 256))" ]; then
			kldunload contigmem.ko
		fi
	fi
	if ! kldstat -q -m contigmem; then
		kenv hw.contigmem.num_buffers=$((HUGEMEM / 256))
		kenv hw.contigmem.buffer_size=$((256 * 1024 * 1024))
		kldload contigmem.ko
	fi
}
659
# FreeBSD "reset" mode: unload both SPDK kernel modules, ignoring failures
# (e.g. when a module was never loaded).
function reset_freebsd() {
	local module
	for module in contigmem.ko nic_uio.ko; do
		kldunload "$module" || true
	done
}
664
# --- main -------------------------------------------------------------

# Populate pci_bus_cache (helper sourced from common.sh). NOTE(review):
# CMD=reset presumably makes cache_pci_bus cache every device regardless
# of the allow/deny lists - confirm in scripts/common.sh.
CMD=reset cache_pci_bus

mode=$1

if [ -z "$mode" ]; then
	mode="config"
fi

# Defaults for the user-tunable environment knobs (see usage()).
: ${HUGEMEM:=2048}
: ${PCI_WHITELIST:=""}
: ${PCI_BLACKLIST:=""}

# NVME_WHITELIST is a legacy alias folded into PCI_WHITELIST.
if [ -n "$NVME_WHITELIST" ]; then
	PCI_WHITELIST="$PCI_WHITELIST $NVME_WHITELIST"
fi

# SKIP_PCI short-circuits all binding by making the allow list unmatchable.
if [ -n "$SKIP_PCI" ]; then
	PCI_WHITELIST="none"
fi

# Default TARGET_USER to whoever invoked sudo, then to logname(1).
if [ -z "$TARGET_USER" ]; then
	TARGET_USER="$SUDO_USER"
	if [ -z "$TARGET_USER" ]; then
		TARGET_USER=$(logname 2> /dev/null) || true
	fi
fi

collect_devices "$mode"

if [[ $mode == reset && $PCI_BLOCK_SYNC_ON_RESET == yes ]]; then
	# Note that this will wait only for the first block device attached to
	# a given storage controller. For nvme this may miss some of the devs
	# in case multiple namespaces are being in place.
	# FIXME: Wait for nvme controller(s) to be in live state and determine
	# number of configured namespaces, build list of potential block devs
	# and pass them to sync_dev_uevents. Is it worth the effort?
	bdfs_to_wait_for=()
	for bdf in "${!all_devices_d[@]}"; do
		((all_devices_d["$bdf"] == 0)) || continue
		if [[ -n ${nvme_d["$bdf"]} || -n ${virtio_d["$bdf"]} ]]; then
			# Only wait for devices whose driver will actually change.
			[[ $(collect_driver "$bdf") != "${drivers_d["$bdf"]}" ]] || continue
			bdfs_to_wait_for+=("$bdf")
		fi
	done
	if ((${#bdfs_to_wait_for[@]} > 0)); then
		echo "Waiting for block devices as requested"
		export UEVENT_TIMEOUT=5 DEVPATH_LOOKUP=yes DEVPATH_SUBSYSTEM=pci
		"$rootdir/scripts/sync_dev_uevents.sh" \
			block/disk \
			"${bdfs_to_wait_for[@]}" &
		sync_pid=$!
	fi
fi

if [[ $os == Linux ]]; then
	# Hugepagesize in /proc/meminfo is reported in kB.
	HUGEPGSZ=$(($(grep Hugepagesize /proc/meminfo | cut -d : -f 2 | tr -dc '0-9')))
	HUGEPGSZ_MB=$((HUGEPGSZ / 1024))
	# Round HUGEMEM (MB) up to a whole number of hugepages, unless the
	# caller already set NRHUGE.
	: ${NRHUGE=$(((HUGEMEM + HUGEPGSZ_MB - 1) / HUGEPGSZ_MB))}

	if [ "$mode" == "config" ]; then
		configure_linux
	elif [ "$mode" == "cleanup" ]; then
		cleanup_linux
	elif [ "$mode" == "reset" ]; then
		reset_linux
	elif [ "$mode" == "status" ]; then
		status_linux
	elif [ "$mode" == "help" ]; then
		usage $0
	else
		usage $0 "Invalid argument '$mode'"
	fi
else
	if [ "$mode" == "config" ]; then
		configure_freebsd
	elif [ "$mode" == "reset" ]; then
		reset_freebsd
	elif [ "$mode" == "cleanup" ]; then
		echo "setup.sh cleanup function not yet supported on $os"
	elif [ "$mode" == "status" ]; then
		status_freebsd
	elif [ "$mode" == "help" ]; then
		usage $0
	else
		usage $0 "Invalid argument '$mode'"
	fi
fi

# If a uevent sync was started above, wait for it to finish. The
# /proc/<pid>/status check also safely handles sync_pid being unset.
if [[ -e /proc/$sync_pid/status ]]; then
	wait "$sync_pid"
fi
756