xref: /spdk/scripts/setup.sh (revision da2fd6651a9cd4732b0910d30291821e77f4d643)
#!/usr/bin/env bash

set -e

os=$(uname -s)

if [[ $os != Linux && $os != FreeBSD ]]; then
	echo "Not supported platform ($os), aborting"
	exit 1
fi

rootdir=$(readlink -f $(dirname $0))/..
source "$rootdir/scripts/common.sh"

function usage() {
	if [[ $os == Linux ]]; then
		options="[config|reset|status|cleanup|help]"
	else
		options="[config|reset|help]"
	fi

	[[ -n $2 ]] && (
		echo "$2"
		echo ""
	)
	echo "Helper script for allocating hugepages and binding NVMe, I/OAT, VMD and Virtio devices"
	echo "to a generic VFIO kernel driver. If VFIO is not available on the system, this script"
	echo "will fall back to UIO. NVMe and Virtio devices with active mountpoints will be ignored."
	echo "All hugepage operations use the default hugepage size on the system (hugepagesz)."
	echo "Usage: $(basename $1) $options"
	echo
	echo "$options - as follows:"
	echo "config            Default mode. Allocate hugepages and bind PCI devices."
	if [[ $os == Linux ]]; then
		echo "cleanup           Remove any orphaned files that may be left in the system after an SPDK application exits"
	fi
	echo "reset             Rebind PCI devices back to their original drivers."
	echo "                  Also clean up any leftover SPDK files/resources."
	echo "                  Hugepage memory size will remain unchanged."
	if [[ $os == Linux ]]; then
		echo "status            Print status of all SPDK-compatible devices on the system."
	fi
	echo "help              Print this help message."
	echo
	echo "The following environment variables can be specified:"
	echo "HUGEMEM           Size of hugepage memory to allocate (in MB). 2048 by default."
	echo "                  For NUMA systems, the hugepages will be distributed on node0 by"
	echo "                  default."
	echo "HUGE_EVEN_ALLOC   If set to 'yes', hugepages will be evenly distributed across all"
	echo "                  of the system's NUMA nodes (effectively ignoring anything set in HUGENODE)."
	echo "                  Uses the kernel's default hugepage size."
	echo "NRHUGE            Number of hugepages to allocate. This variable overrides HUGEMEM."
	echo "HUGENODE          Specific NUMA node to allocate hugepages on. Multiple nodes can be"
	echo "                  separated with commas. By default, NRHUGE will be applied on each node."
	echo "                  Hugepages can be defined per node with e.g.:"
	echo "                  HUGENODE='nodes_hp[0]=2048,nodes_hp[1]=512,2' - this will allocate"
	echo "                  2048 pages for node0, 512 for node1 and the default NRHUGE for node2."
	echo "HUGEPGSZ          Size of the hugepages to use in kB. If not set, the kernel's default"
	echo "                  setting is used."
	echo "CLEAR_HUGE        If set to 'yes', an attempt to remove hugepages from all nodes will"
	echo "                  be made prior to allocation."
	echo "PCI_ALLOWED"
	echo "PCI_BLOCKED       Whitespace-separated list of PCI devices (NVMe, I/OAT, VMD, Virtio)."
	echo "                  Each device must be specified as a full PCI address."
	echo "                  E.g. PCI_ALLOWED=\"0000:01:00.0 0000:02:00.0\""
	echo "                  To block all PCI devices, use an invalid address."
	echo "                  E.g. PCI_BLOCKED=\"none\""
	echo "                  If PCI_ALLOWED and PCI_BLOCKED are empty or unset, all PCI devices"
	echo "                  will be bound."
	echo "                  Each device in PCI_BLOCKED will be ignored (its driver won't be changed)."
	echo "                  PCI_BLOCKED takes precedence over PCI_ALLOWED."
	echo "TARGET_USER       User that will own the hugepage mountpoint directory and vfio groups."
	echo "                  By default the current user will be used."
	echo "DRIVER_OVERRIDE   Disable automatic vfio-pci/uio_pci_generic selection and forcefully"
	echo "                  bind devices to the given driver."
	echo "                  E.g. DRIVER_OVERRIDE=uio_pci_generic or DRIVER_OVERRIDE=/home/public/dpdk/build/kmod/igb_uio.ko"
	echo "PCI_BLOCK_SYNC_ON_RESET"
	echo "                  If set in the environment, an attempt to wait for block devices associated"
	echo "                  with a given PCI device will be made upon reset."
	exit 0
}
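
# Illustrative invocations (device addresses and sizes are hypothetical; the
# script writes to sysfs, so it expects to run as root):
#   sudo HUGEMEM=4096 scripts/setup.sh                 # bind devices, allocate ~4GiB of hugepages
#   sudo scripts/setup.sh reset                        # rebind devices to their original drivers
#   sudo PCI_BLOCKED="0000:01:00.0" scripts/setup.sh   # leave one controller untouched
#   sudo DRIVER_OVERRIDE=uio_pci_generic scripts/setup.sh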

# In monolithic kernels lsmod won't work, so fall back to checking
# /sys/module. We also check /sys/bus/pci/drivers/ as neither lsmod nor
# /sys/module may contain the needed info (like in Fedora-like OSes).
function check_for_driver() {
	if lsmod | grep -q ${1//-/_}; then
		return 1
	fi

	if [[ -d /sys/module/${1} || -d \
		/sys/module/${1//-/_} || -d \
		/sys/bus/pci/drivers/${1} || -d \
		/sys/bus/pci/drivers/${1//-/_} ]]; then
		return 2
	fi
	return 0
}
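
# Note the inverted convention above: check_for_driver returns non-zero when
# the driver IS present (1 via lsmod, 2 via sysfs). Illustrative use:
#   if ! check_for_driver vfio-pci; then
#       echo "vfio-pci is already loaded or registered"
#   fi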

function check_for_driver_freebsd() {
	# Check if dpdk drivers (nic_uio, contigmem) are in the kernel's module path.
	local search_paths path driver
	IFS=";" read -ra search_paths < <(kldconfig -rU)

	for driver in contigmem.ko nic_uio.ko; do
		for path in "${search_paths[@]}"; do
			[[ -f $path/$driver ]] && continue 2
		done
		return 1
	done
	return 0
}

function pci_dev_echo() {
	local bdf="$1"
	shift
	echo "$bdf (${pci_ids_vendor["$bdf"]#0x} ${pci_ids_device["$bdf"]#0x}): $*"
}

function linux_bind_driver() {
	bdf="$1"
	driver_name="$2"
	old_driver_name=${drivers_d["$bdf"]:-no driver}
	ven_dev_id="${pci_ids_vendor["$bdf"]#0x} ${pci_ids_device["$bdf"]#0x}"

	if [[ $driver_name == "$old_driver_name" ]]; then
		pci_dev_echo "$bdf" "Already using the $old_driver_name driver"
		return 0
	fi

	if [[ $old_driver_name != "no driver" ]]; then
		echo "$ven_dev_id" > "/sys/bus/pci/devices/$bdf/driver/remove_id" 2> /dev/null || true
		echo "$bdf" > "/sys/bus/pci/devices/$bdf/driver/unbind"
	fi

	pci_dev_echo "$bdf" "$old_driver_name -> $driver_name"

	echo "$ven_dev_id" > "/sys/bus/pci/drivers/$driver_name/new_id" 2> /dev/null || true
	echo "$bdf" > "/sys/bus/pci/drivers/$driver_name/bind" 2> /dev/null || true

	if [[ $driver_name == uio_pci_generic ]] && ! check_for_driver igb_uio; then
		# Check if the uio_pci_generic driver is broken as it might be in
		# some 4.18.x kernels (see centos8 for instance) - if our device
		# didn't get a proper uio entry, fall back to igb_uio
		if [[ ! -e /sys/bus/pci/devices/$bdf/uio ]]; then
			pci_dev_echo "$bdf" "uio_pci_generic potentially broken, moving to igb_uio"
			drivers_d["$bdf"]="no driver"
			# This call will override $driver_name for the remaining devices as well
			linux_bind_driver "$bdf" igb_uio
		fi
	fi

	iommu_group=$(basename $(readlink -f /sys/bus/pci/devices/$bdf/iommu_group))
	if [ -e "/dev/vfio/$iommu_group" ]; then
		if [ -n "$TARGET_USER" ]; then
			chown "$TARGET_USER" "/dev/vfio/$iommu_group"
		fi
	fi
}
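
# linux_bind_driver above mirrors the manual sysfs rebind sequence; for a
# hypothetical NVMe controller 0000:01:00.0 (vendor 8086, device 0953) moving
# to vfio-pci it amounts to:
#   echo "8086 0953"  > /sys/bus/pci/devices/0000:01:00.0/driver/remove_id
#   echo 0000:01:00.0 > /sys/bus/pci/devices/0000:01:00.0/driver/unbind
#   echo "8086 0953"  > /sys/bus/pci/drivers/vfio-pci/new_id
#   echo 0000:01:00.0 > /sys/bus/pci/drivers/vfio-pci/bind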

function linux_unbind_driver() {
	local bdf="$1"
	local ven_dev_id
	ven_dev_id="${pci_ids_vendor["$bdf"]#0x} ${pci_ids_device["$bdf"]#0x}"
	local old_driver_name=${drivers_d["$bdf"]:-no driver}

	if [[ -e /sys/bus/pci/drivers/$old_driver_name ]]; then
		echo "$ven_dev_id" > "/sys/bus/pci/drivers/$old_driver_name/remove_id" 2> /dev/null || true
		echo "$bdf" > "/sys/bus/pci/drivers/$old_driver_name/unbind"
	fi

	pci_dev_echo "$bdf" "$old_driver_name -> no driver"
}

function linux_hugetlbfs_mounts() {
	mount | grep ' type hugetlbfs ' | awk '{ print $3 }'
}

function get_block_dev_from_bdf() {
	local bdf=$1
	local block

	for block in /sys/block/*; do
		if [[ $(readlink -f "$block/device") == *"/$bdf/"* ]]; then
			echo "${block##*/}"
		fi
	done
}

function get_mounted_part_dev_from_bdf_block() {
	local bdf=$1
	local blocks block dev mount

	hash lsblk || return 1
	blocks=($(get_block_dev_from_bdf "$bdf"))

	for block in "${blocks[@]}"; do
		while read -r dev mount; do
			if [[ -e $mount ]]; then
				echo "$block:$dev"
			fi
		done < <(lsblk -l -n -o NAME,MOUNTPOINT "/dev/$block")
	done
}
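
# get_mounted_part_dev_from_bdf_block emits "<disk>:<mounted partition>" pairs;
# e.g. a root filesystem living on an NVMe drive would show up as (illustrative):
#   nvme0n1:nvme0n1p2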

function collect_devices() {
	# NVMe, IOAT, IDXD, VIRTIO, VMD

	local ids dev_type dev_id bdf bdfs in_use driver

	ids+="PCI_DEVICE_ID_INTEL_IOAT"
	ids+="|PCI_DEVICE_ID_INTEL_IDXD"
	ids+="|PCI_DEVICE_ID_VIRTIO"
	ids+="|PCI_DEVICE_ID_INTEL_VMD"
	ids+="|SPDK_PCI_CLASS_NVME"

	local -gA nvme_d ioat_d idxd_d virtio_d vmd_d all_devices_d drivers_d

	while read -r _ dev_type dev_id; do
		bdfs=(${pci_bus_cache["0x8086:$dev_id"]})
		[[ $dev_type == *NVME* ]] && bdfs=(${pci_bus_cache["$dev_id"]})
		[[ $dev_type == *VIRT* ]] && bdfs=(${pci_bus_cache["0x1af4:$dev_id"]})
		[[ $dev_type =~ (NVME|IOAT|IDXD|VIRTIO|VMD) ]] && dev_type=${BASH_REMATCH[1],,}
		for bdf in "${bdfs[@]}"; do
			in_use=0
			if [[ $1 != status ]]; then
				if ! pci_can_use "$bdf"; then
					pci_dev_echo "$bdf" "Skipping denied controller at $bdf"
					in_use=1
				fi
				if [[ $dev_type == nvme || $dev_type == virtio ]]; then
					if ! verify_bdf_mounts "$bdf"; then
						in_use=1
					fi
				fi
				if [[ $dev_type == vmd ]]; then
					if [[ $PCI_ALLOWED != *"$bdf"* ]]; then
						pci_dev_echo "$bdf" "Skipping not allowed VMD controller at $bdf"
						in_use=1
					fi
				fi
			fi
			eval "${dev_type}_d[$bdf]=$in_use"
			all_devices_d["$bdf"]=$in_use
			if [[ -e /sys/bus/pci/devices/$bdf/driver ]]; then
				driver=$(readlink -f "/sys/bus/pci/devices/$bdf/driver")
				drivers_d["$bdf"]=${driver##*/}
			fi
		done
	done < <(grep -E "$ids" "$rootdir/include/spdk/pci_ids.h")
}
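
# After collect_devices, each per-type array maps a BDF to an in-use flag,
# e.g. (illustrative):
#   nvme_d[0000:01:00.0]=0    # free to rebind
#   virtio_d[0000:02:00.0]=1  # skipped (denied or actively mounted)
# and drivers_d records the currently bound driver, e.g. drivers_d[0000:01:00.0]=nvme.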

function collect_driver() {
	local bdf=$1
	local drivers driver

	[[ -e /sys/bus/pci/devices/$bdf/modalias ]] || return 1
	if drivers=($(modprobe -R "$(< "/sys/bus/pci/devices/$bdf/modalias")")); then
		# Pick first entry in case multiple aliases are bound to a driver.
		driver=$(readlink -f "/sys/module/${drivers[0]}/drivers/pci:"*)
		driver=${driver##*/}
	else
		[[ -n ${nvme_d["$bdf"]} ]] && driver=nvme
		[[ -n ${ioat_d["$bdf"]} ]] && driver=ioatdma
		[[ -n ${idxd_d["$bdf"]} ]] && driver=idxd
		[[ -n ${virtio_d["$bdf"]} ]] && driver=virtio-pci
		[[ -n ${vmd_d["$bdf"]} ]] && driver=vmd
	fi 2> /dev/null
	echo "$driver"
}
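
# modprobe -R resolves a device's modalias to candidate module names, which is
# how reset rediscovers the original driver, e.g. (hypothetical BDF):
#   modprobe -R "$(< /sys/bus/pci/devices/0000:01:00.0/modalias)"   # -> nvme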

function verify_bdf_mounts() {
	local bdf=$1
	local blknames
	blknames=($(get_mounted_part_dev_from_bdf_block "$bdf")) || return 1

	if ((${#blknames[@]} > 0)); then
		local IFS=","
		pci_dev_echo "$bdf" "Active mountpoints on ${blknames[*]}, so not binding PCI dev"
		return 1
	fi
}

function configure_linux_pci() {
	local driver_path=""
	driver_name=""
	igb_uio_fallback=""

	if [[ -r "$rootdir/dpdk/build-tmp/kernel/linux/igb_uio/igb_uio.ko" ]]; then
		# igb_uio is a common driver to override with and it depends on uio.
		modprobe uio || true
		if ! check_for_driver igb_uio || insmod "$rootdir/dpdk/build-tmp/kernel/linux/igb_uio/igb_uio.ko"; then
			igb_uio_fallback="$rootdir/dpdk/build-tmp/kernel/linux/igb_uio/igb_uio.ko"
		fi
	fi

	if [[ -n "${DRIVER_OVERRIDE}" ]]; then
		driver_path="$DRIVER_OVERRIDE"
		driver_name="${DRIVER_OVERRIDE##*/}"
		# modprobe and the sysfs don't use the .ko suffix.
		driver_name=${driver_name%.ko}
		# path = name -> there is no path
		if [[ "$driver_path" = "$driver_name" ]]; then
			driver_path=""
		fi
	elif [[ -n "$(ls /sys/kernel/iommu_groups)" || (-e \
	/sys/module/vfio/parameters/enable_unsafe_noiommu_mode && \
	"$(cat /sys/module/vfio/parameters/enable_unsafe_noiommu_mode)" == "Y") ]]; then
		driver_name=vfio-pci
		# Just in case, attempt to load the VFIO_IOMMU_TYPE1 module into the kernel - this
		# should be done automatically by modprobe since this particular module should
		# be a part of vfio-pci dependencies; however, on some distros that does not
		# seem to be the case. See #1689.
		if modinfo vfio_iommu_type1 > /dev/null; then
			modprobe vfio_iommu_type1
		fi
	elif ! check_for_driver uio_pci_generic || modinfo uio_pci_generic > /dev/null 2>&1; then
		driver_name=uio_pci_generic
	elif [[ -e $igb_uio_fallback ]]; then
		driver_path="$igb_uio_fallback"
		driver_name="igb_uio"
		echo "WARNING: uio_pci_generic not detected - using $driver_name"
	else
		echo "No valid drivers found [vfio-pci, uio_pci_generic, igb_uio]. Please enable one of the kernel modules."
		return 1
	fi

	# modprobe looks the module up in the kernel's module directory, so if the
	# user passed in a path, use insmod instead.
	if [[ -n "$driver_path" ]]; then
		insmod $driver_path || true
	else
		modprobe $driver_name
	fi

	for bdf in "${!all_devices_d[@]}"; do
		if ((all_devices_d["$bdf"] == 0)); then
			if [[ -n ${nvme_d["$bdf"]} ]]; then
				# Some nvme controllers may take a significant amount of time while being
				# unbound from the driver. Put that task into the background to speed up the
				# whole process. Currently this is done only for the devices bound to the
				# nvme driver as other drivers, i.e. ioatdma, trigger a kernel BUG when being
				# unbound in parallel. See https://bugzilla.kernel.org/show_bug.cgi?id=209041.
				linux_bind_driver "$bdf" "$driver_name" &
			else
				linux_bind_driver "$bdf" "$driver_name"
			fi
		fi
	done
	wait

	echo "1" > "/sys/bus/pci/rescan"
}

function cleanup_linux() {
	shopt -s extglob nullglob
	dirs_to_clean=""
	dirs_to_clean="$(echo {/var/run,/tmp}/dpdk/spdk{,_pid}+([0-9])) "
	if [[ -d $XDG_RUNTIME_DIR && $XDG_RUNTIME_DIR != *" "* ]]; then
		dirs_to_clean+="$(readlink -e assert_not_empty $XDG_RUNTIME_DIR/dpdk/spdk{,_pid}+([0-9]) || true) "
	fi

	files_to_clean=""
	for dir in $dirs_to_clean; do
		files_to_clean+="$(echo $dir/*) "
	done
	shopt -u extglob nullglob

	files_to_clean+="$(ls -1 /dev/shm/* \
		| grep -E '(spdk_tgt|iscsi|vhost|nvmf|rocksdb|bdevio|bdevperf|vhost_fuzz|nvme_fuzz)_trace|spdk_iscsi_conns' || true) "
	files_to_clean="$(readlink -e assert_not_empty $files_to_clean || true)"
	if [[ -z "$files_to_clean" ]]; then
		echo "Clean"
		return 0
	fi

	shopt -s extglob
	for fd_dir in $(echo /proc/+([0-9])); do
		opened_files+="$(readlink -e assert_not_empty $fd_dir/fd/* || true)"
	done
	shopt -u extglob

	if [[ -z "$opened_files" ]]; then
		echo "Can't get list of opened files!"
		exit 1
	fi

	echo 'Cleaning'
	for f in $files_to_clean; do
		if ! echo "$opened_files" | grep -E -q "^$f\$"; then
			echo "Removing:    $f"
			rm $f
		else
			echo "Still open: $f"
		fi
	done

	for dir in $dirs_to_clean; do
		if ! echo "$opened_files" | grep -E -q "^$dir\$"; then
			echo "Removing:    $dir"
			rmdir $dir
		else
			echo "Still open: $dir"
		fi
	done
	echo "Clean"

	unset dirs_to_clean files_to_clean opened_files
}

check_hugepages_alloc() {
	local hp_int=$1
	local allocated_hugepages

	echo $((NRHUGE < 0 ? 0 : NRHUGE)) > "$hp_int"

	allocated_hugepages=$(< "$hp_int")
	if ((allocated_hugepages < NRHUGE)); then
		cat <<- ERROR

			## ERROR: requested $NRHUGE hugepages but only $allocated_hugepages could be allocated ${2:+on node$2}.
			## Memory might be heavily fragmented. Please try flushing the system cache, or reboot the machine.
		ERROR
		return 1
	fi
}

clear_hugepages() { echo 0 > /proc/sys/vm/nr_hugepages; }

configure_linux_hugepages() {
	local node system_nodes
	local nodes_to_use nodes_hp

	if [[ $CLEAR_HUGE == yes ]]; then
		clear_hugepages
	fi

	if [[ $HUGE_EVEN_ALLOC == yes ]]; then
		clear_hugepages
		check_hugepages_alloc /proc/sys/vm/nr_hugepages
		return 0
	fi

	for node in /sys/devices/system/node/node*; do
		[[ -e $node ]] || continue
		nodes[${node##*node}]=$node/hugepages/hugepages-${HUGEPGSZ}kB/nr_hugepages
	done

	IFS="," read -ra nodes_to_use <<< "$HUGENODE"
	if ((${#nodes_to_use[@]} == 0)); then
		nodes_to_use[0]=0
	fi

	# Align indexes with node ids
	for node in "${!nodes_to_use[@]}"; do
		if [[ ${nodes_to_use[node]} =~ ^nodes_hp\[[0-9]+\]= ]]; then
			eval "${nodes_to_use[node]}"
		elif [[ ${nodes_to_use[node]} =~ ^[0-9]+$ ]]; then
			nodes_hp[nodes_to_use[node]]=$NRHUGE
		fi
	done

	for node in "${!nodes_hp[@]}"; do
		if [[ -z ${nodes[node]} ]]; then
			echo "Node $node doesn't exist, ignoring" >&2
			continue
		fi
		NRHUGE=${nodes_hp[node]:-$NRHUGE} check_hugepages_alloc "${nodes[node]}" "$node"
	done
}
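
# Illustrative hugepage configurations handled by the logic above:
#   HUGEMEM=4096 scripts/setup.sh                  # ~4GiB of default-size pages on node0
#   NRHUGE=1024 HUGENODE=0,1 scripts/setup.sh      # 1024 pages on node0 and on node1
#   HUGENODE='nodes_hp[0]=2048,nodes_hp[1]=512' scripts/setup.sh
#   HUGE_EVEN_ALLOC=yes HUGEMEM=8192 scripts/setup.sh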

function configure_linux() {
	configure_linux_pci
	hugetlbfs_mounts=$(linux_hugetlbfs_mounts)

	if [ -z "$hugetlbfs_mounts" ]; then
		hugetlbfs_mounts=/mnt/huge
		echo "Mounting hugetlbfs at $hugetlbfs_mounts"
		mkdir -p "$hugetlbfs_mounts"
		mount -t hugetlbfs nodev "$hugetlbfs_mounts"
	fi

	configure_linux_hugepages

	if [ "$driver_name" = "vfio-pci" ]; then
		if [ -n "$TARGET_USER" ]; then
			for mount in $hugetlbfs_mounts; do
				chown "$TARGET_USER" "$mount"
				chmod g+w "$mount"
			done

			MEMLOCK_AMNT=$(su "$TARGET_USER" -c "ulimit -l")
			if [[ $MEMLOCK_AMNT != "unlimited" ]]; then
				MEMLOCK_MB=$((MEMLOCK_AMNT / 1024))
				cat <<- MEMLOCK
					"$TARGET_USER" user memlock limit: $MEMLOCK_MB MB

					This is the maximum amount of memory you will be
					able to use with DPDK and VFIO if run as user "$TARGET_USER".
					To change this, please adjust limits.conf memlock limit for user "$TARGET_USER".
				MEMLOCK
				if ((MEMLOCK_AMNT < 65536)); then
					echo ""
					echo "## WARNING: memlock limit is less than 64MB"
					echo -n "## DPDK with VFIO may not be able to initialize "
					echo "if run as user \"$TARGET_USER\"."
				fi
			fi
		fi
	fi

	if [ ! -e /dev/cpu/0/msr ]; then
		# Some distros build msr as a module. Make sure it's loaded to ensure
		# DPDK can easily figure out the TSC rate rather than relying on 100ms
		# sleeps.
		modprobe msr || true
	fi
}

function reset_linux_pci() {
	# virtio
	# TODO: check if virtio-pci is loaded first and just unbind if it is not loaded
	# Requires some more investigation - for example, some kernels do not seem to have
	# virtio-pci but just virtio_scsi instead. Also need to make sure we get the
	# underscore vs. dash right in the virtio_scsi name.
	modprobe virtio-pci || true
	for bdf in "${!all_devices_d[@]}"; do
		((all_devices_d["$bdf"] == 0)) || continue

		driver=$(collect_driver "$bdf")
		if ! check_for_driver "$driver"; then
			linux_bind_driver "$bdf" "$driver"
		else
			linux_unbind_driver "$bdf"
		fi
	done

	echo "1" > "/sys/bus/pci/rescan"
}

function reset_linux() {
	reset_linux_pci
	for mount in $(linux_hugetlbfs_mounts); do
		for hp in "$mount"/spdk*map_*; do
			flock -n "$hp" true && rm -f "$hp"
		done
	done
	rm -f /run/.spdk*
}

function status_linux() {
	echo "Hugepages" >&2
	printf "%-6s %10s %8s / %6s\n" "node" "hugesize" "free" "total" >&2

	numa_nodes=0
	shopt -s nullglob
	for path in /sys/devices/system/node/node*/hugepages/hugepages-*/; do
		numa_nodes=$((numa_nodes + 1))
		free_pages=$(cat $path/free_hugepages)
		all_pages=$(cat $path/nr_hugepages)

		[[ $path =~ (node[0-9]+)/hugepages/hugepages-([0-9]+kB) ]]

		node=${BASH_REMATCH[1]}
		huge_size=${BASH_REMATCH[2]}

		printf "%-6s %10s %8s / %6s\n" $node $huge_size $free_pages $all_pages
	done
	shopt -u nullglob

	# fall back to system-wide hugepages
	if [ "$numa_nodes" = "0" ]; then
		free_pages=$(grep HugePages_Free /proc/meminfo | awk '{ print $2 }')
		all_pages=$(grep HugePages_Total /proc/meminfo | awk '{ print $2 }')
		node="-"
		huge_size="$HUGEPGSZ"

		printf "%-6s %10s %8s / %6s\n" $node $huge_size $free_pages $all_pages
	fi

	printf '\n%-8s %-15s %-6s %-6s %-7s %-16s %-10s %s\n' \
		"Type" "BDF" "Vendor" "Device" "NUMA" "Driver" "Device" "Block devices" >&2

	sorted_bdfs=($(printf '%s\n' "${!all_devices_d[@]}" | sort))

	for bdf in "${sorted_bdfs[@]}"; do
		driver=${drivers_d["$bdf"]}
		if [ "$numa_nodes" = "0" ]; then
			node="-"
		else
			node=$(cat /sys/bus/pci/devices/$bdf/numa_node)
			if ((node == -1)); then
				node=unknown
			fi
		fi
		if [ "$driver" = "nvme" ] && [ -d /sys/bus/pci/devices/$bdf/nvme ]; then
			name=$(ls /sys/bus/pci/devices/$bdf/nvme)
		else
			name="-"
		fi

		if [[ -n ${nvme_d["$bdf"]} || -n ${virtio_d["$bdf"]} ]]; then
			blknames=($(get_block_dev_from_bdf "$bdf"))
		else
			blknames=("-")
		fi

		desc=""
		desc=${desc:-${nvme_d["$bdf"]:+NVMe}}
		desc=${desc:-${ioat_d["$bdf"]:+I/OAT}}
		desc=${desc:-${idxd_d["$bdf"]:+IDXD}}
		desc=${desc:-${virtio_d["$bdf"]:+virtio}}
		desc=${desc:-${vmd_d["$bdf"]:+VMD}}

		printf '%-8s %-15s %-6s %-6s %-7s %-16s %-10s %s\n' \
			"$desc" "$bdf" "${pci_ids_vendor["$bdf"]#0x}" "${pci_ids_device["$bdf"]#0x}" \
			"$node" "${driver:--}" "${name:-}" "${blknames[*]:--}"
	done
}
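
# Sample "status" output, per the printf formats above (values illustrative,
# alignment approximate):
#   node     hugesize     free /  total
#   node0      2048kB     1024 /   1024
#
#   Type     BDF             Vendor Device NUMA    Driver           Device     Block devices
#   NVMe     0000:01:00.0    8086   0953   0       vfio-pci         -          -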

function status_freebsd() {
	local pci

	status_print() (
		local dev driver

		echo -e "BDF\t\tVendor\tDevice\tDriver"

		for pci; do
			driver=$(pciconf -l "pci$pci")
			driver=${driver%@*}
			printf '%s\t%s\t%s\t%s\n' \
				"$pci" \
				"${pci_ids_vendor["$pci"]}" \
				"${pci_ids_device["$pci"]}" \
				"$driver"
		done | sort -k1,1
	)

	local contigmem=present
	local contigmem_buffer_size
	local contigmem_num_buffers

	if ! kldstat -q -m contigmem; then
		contigmem="not present"
	fi
	if ! contigmem_buffer_size=$(kenv hw.contigmem.buffer_size 2> /dev/null); then
		contigmem_buffer_size="not set"
	fi
	if ! contigmem_num_buffers=$(kenv hw.contigmem.num_buffers 2> /dev/null); then
		contigmem_num_buffers="not set"
	fi

	cat <<- BSD_INFO
		Contigmem ($contigmem)
		Buffer Size: $contigmem_buffer_size
		Num Buffers: $contigmem_num_buffers

		NVMe devices
		$(status_print "${!nvme_d[@]}")

		I/OAT DMA
		$(status_print "${!ioat_d[@]}")

		IDXD DMA
		$(status_print "${!idxd_d[@]}")

		VMD
		$(status_print "${!vmd_d[@]}")
	BSD_INFO
}

function configure_freebsd_pci() {
	local BDFS

	BDFS+=("${!nvme_d[@]}")
	BDFS+=("${!ioat_d[@]}")
	BDFS+=("${!idxd_d[@]}")
	BDFS+=("${!vmd_d[@]}")

	# Drop the domain part from all the addresses
	BDFS=("${BDFS[@]#*:}")

	local IFS=","
	kldunload nic_uio.ko || true
	kenv hw.nic_uio.bdfs="${BDFS[*]}"
	kldload nic_uio.ko
}
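
# nic_uio takes a comma-separated, domain-less list of addresses, so e.g.
# 0000:01:00.0 and 0000:02:00.0 (hypothetical) are handed over as:
#   kenv hw.nic_uio.bdfs="01:00.0,02:00.0"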

function configure_freebsd() {
	if ! check_for_driver_freebsd; then
		echo "DPDK drivers (contigmem and/or nic_uio) are missing, aborting" >&2
		return 1
	fi
	configure_freebsd_pci
	# If contigmem is already loaded but the HUGEMEM specified doesn't match the
	# previous value, unload contigmem so that we can reload with the new value.
	if kldstat -q -m contigmem; then
		# contigmem may be loaded, but the kernel environment doesn't
		# necessarily have to be set at this point. If it isn't, kenv will fail
		# to pick up the hw. options. Handle it.
		if ! contigmem_num_buffers=$(kenv hw.contigmem.num_buffers); then
			contigmem_num_buffers=-1
		fi 2> /dev/null
		if ((contigmem_num_buffers != HUGEMEM / 256)); then
			kldunload contigmem.ko
		fi
	fi
	if ! kldstat -q -m contigmem; then
		kenv hw.contigmem.num_buffers=$((HUGEMEM / 256))
		kenv hw.contigmem.buffer_size=$((256 * 1024 * 1024))
		kldload contigmem.ko
	fi
}
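
# Sizing arithmetic above: HUGEMEM (MB) is split into 256MB contigmem buffers,
# so the default HUGEMEM=2048 yields num_buffers = 2048 / 256 = 8 buffers of
# 256 * 1024 * 1024 bytes each.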

function reset_freebsd() {
	kldunload contigmem.ko || true
	kldunload nic_uio.ko || true
}

CMD=reset cache_pci_bus

mode=$1

if [ -z "$mode" ]; then
	mode="config"
fi

: ${HUGEMEM:=2048}
: ${PCI_ALLOWED:=""}
: ${PCI_BLOCKED:=""}

if [ -n "$NVME_ALLOWED" ]; then
	PCI_ALLOWED="$PCI_ALLOWED $NVME_ALLOWED"
fi

if [ -n "$SKIP_PCI" ]; then
	PCI_ALLOWED="none"
fi

if [ -z "$TARGET_USER" ]; then
	TARGET_USER="$SUDO_USER"
	if [ -z "$TARGET_USER" ]; then
		TARGET_USER=$(logname 2> /dev/null) || true
	fi
fi

collect_devices "$mode"

if [[ $mode == reset && $PCI_BLOCK_SYNC_ON_RESET == yes ]]; then
	# Note that this will wait only for the first block device attached to
	# a given storage controller. For nvme this may miss some of the devs
	# in case multiple namespaces are in place.
	# FIXME: Wait for nvme controller(s) to be in live state and determine
	# number of configured namespaces, build list of potential block devs
	# and pass them to sync_dev_uevents. Is it worth the effort?
	bdfs_to_wait_for=()
	for bdf in "${!all_devices_d[@]}"; do
		((all_devices_d["$bdf"] == 0)) || continue
		if [[ -n ${nvme_d["$bdf"]} || -n ${virtio_d["$bdf"]} ]]; then
			[[ $(collect_driver "$bdf") != "${drivers_d["$bdf"]}" ]] || continue
			bdfs_to_wait_for+=("$bdf")
		fi
	done
	if ((${#bdfs_to_wait_for[@]} > 0)); then
		echo "Waiting for block devices as requested"
		export UEVENT_TIMEOUT=5 DEVPATH_LOOKUP=yes DEVPATH_SUBSYSTEM=pci
		"$rootdir/scripts/sync_dev_uevents.sh" \
			block/disk \
			"${bdfs_to_wait_for[@]}" &
		sync_pid=$!
	fi
fi

if [[ $os == Linux ]]; then
	if [[ -n $HUGEPGSZ && ! -e /sys/kernel/mm/hugepages/hugepages-${HUGEPGSZ}kB ]]; then
		echo "${HUGEPGSZ}kB is not supported by the running kernel, ignoring" >&2
		unset -v HUGEPGSZ
	fi

	HUGEPGSZ=${HUGEPGSZ:-$(grep Hugepagesize /proc/meminfo | cut -d : -f 2 | tr -dc '0-9')}
	HUGEPGSZ_MB=$((HUGEPGSZ / 1024))
	: ${NRHUGE=$(((HUGEMEM + HUGEPGSZ_MB - 1) / HUGEPGSZ_MB))}
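
	# NRHUGE above rounds HUGEMEM up to whole pages, e.g. with 2048kB hugepages
	# (HUGEPGSZ_MB=2) the default HUGEMEM=2048 gives (2048 + 2 - 1) / 2 = 1024
	# pages (worked example; actual values depend on the running kernel).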

	if [ "$mode" == "config" ]; then
		configure_linux
	elif [ "$mode" == "cleanup" ]; then
		cleanup_linux
	elif [ "$mode" == "reset" ]; then
		reset_linux
	elif [ "$mode" == "status" ]; then
		status_linux
	elif [ "$mode" == "help" ]; then
		usage $0
	else
		usage $0 "Invalid argument '$mode'"
	fi
else
	if [ "$mode" == "config" ]; then
		configure_freebsd
	elif [ "$mode" == "reset" ]; then
		reset_freebsd
	elif [ "$mode" == "cleanup" ]; then
		echo "setup.sh cleanup function not yet supported on $os"
	elif [ "$mode" == "status" ]; then
		status_freebsd
	elif [ "$mode" == "help" ]; then
		usage $0
	else
		usage $0 "Invalid argument '$mode'"
	fi
fi

if [[ -e /proc/$sync_pid/status ]]; then
	wait "$sync_pid"
fi