xref: /spdk/scripts/setup.sh (revision 3795c1cfd813133ff3a70d6e24539f1cb6b14d8d)
#!/usr/bin/env bash

# Abort on the first failing command - this script mutates system state
# (drivers, hugepages) and must not keep going after an error.
set -e

os=$(uname -s)

# Only Linux and FreeBSD are supported platforms.
if [[ $os != Linux && $os != FreeBSD ]]; then
	echo "Not supported platform ($os), aborting"
	exit 1
fi

# Resolve the SPDK repository root relative to this script's location.
rootdir=$(readlink -f $(dirname $0))/..
source "$rootdir/scripts/common.sh"
14
# Print the help text and exit 0. $1 is the path the script was invoked
# as (used for the usage line); optional $2 is an error message printed
# before the help text.
function usage() {
	if [[ $os == Linux ]]; then
		options="[config|reset|status|cleanup|help]"
	else
		options="[config|reset|help]"
	fi

	[[ -n $2 ]] && (
		echo "$2"
		echo ""
	)
	echo "Helper script for allocating hugepages and binding NVMe, I/OAT, VMD and Virtio devices"
	echo "to a generic VFIO kernel driver. If VFIO is not available on the system, this script"
	echo "will fall back to UIO. NVMe and Virtio devices with active mountpoints will be ignored."
	echo "All hugepage operations use default hugepage size on the system (hugepagesz)."
	echo "Usage: $(basename "$1") $options"
	echo
	echo "$options - as following:"
	echo "config            Default mode. Allocate hugepages and bind PCI devices."
	if [[ $os == Linux ]]; then
		echo "cleanup           Remove any orphaned files that can be left in the system after SPDK application exit"
	fi
	echo "reset             Rebind PCI devices back to their original drivers."
	echo "                  Also cleanup any leftover spdk files/resources."
	echo "                  Hugepage memory size will remain unchanged."
	if [[ $os == Linux ]]; then
		echo "status            Print status of all SPDK-compatible devices on the system."
	fi
	echo "help              Print this help message."
	echo
	echo "The following environment variables can be specified."
	echo "HUGEMEM           Size of hugepage memory to allocate (in MB). 2048 by default."
	echo "                  For NUMA systems, the hugepages will be distributed on node0 by"
	echo "                  default."
	echo "HUGE_EVEN_ALLOC   If set to 'yes', hugepages will be evenly distributed across all"
	echo "                  system's NUMA nodes (effectively ignoring anything set in HUGENODE)."
	echo "                  Uses kernel's default for hugepages size."
	echo "NRHUGE            Number of hugepages to allocate. This variable overwrites HUGEMEM."
	echo "HUGENODE          Specific NUMA node to allocate hugepages on. Multiple nodes can be"
	echo "                  separated with commas. By default, NRHUGE will be applied on each node."
	echo "                  Hugepages can be defined per node with e.g.:"
	echo "                  HUGENODE='nodes_hp[0]=2048,nodes_hp[1]=512,2' - this will allocate"
	echo "                  2048 pages for node0, 512 for node1 and default NRHUGE for node2."
	echo "HUGEPGSZ          Size of the hugepages to use in kB. If not set, kernel's default"
	echo "                  setting is used."
	echo "CLEAR_HUGE        If set to 'yes', the attempt to remove hugepages from all nodes will"
	echo "                  be made prior to allocation."
	echo "PCI_ALLOWED"
	echo "PCI_BLOCKED       Whitespace separated list of PCI devices (NVMe, I/OAT, VMD, Virtio)."
	echo "                  Each device must be specified as a full PCI address."
	echo "                  E.g. PCI_ALLOWED=\"0000:01:00.0 0000:02:00.0\""
	echo "                  To block all PCI devices use a non-valid address."
	echo "                  E.g. PCI_BLOCKED=\"none\""
	echo "                  If PCI_ALLOWED and PCI_BLOCKED are empty or unset, all PCI devices"
	echo "                  will be bound."
	echo "                  Each device in PCI_BLOCKED will be ignored (driver won't be changed)."
	echo "                  PCI_BLOCKED has precedence over PCI_ALLOWED."
	echo "TARGET_USER       User that will own hugepage mountpoint directory and vfio groups."
	echo "                  By default the current user will be used."
	echo "DRIVER_OVERRIDE   Disable automatic vfio-pci/uio_pci_generic selection and forcefully"
	echo "                  bind devices to the given driver."
	echo "                  E.g. DRIVER_OVERRIDE=uio_pci_generic or DRIVER_OVERRIDE=/home/public/dpdk/build/kmod/igb_uio.ko"
	echo "PCI_BLOCK_SYNC_ON_RESET"
	echo "                  If set in the environment, the attempt to wait for block devices associated"
	echo "                  with given PCI device will be made upon reset"
	exit 0
}
82
# Detect whether a kernel driver/module is available. lsmod alone is not
# sufficient on monolithic kernels, so /sys/module and
# /sys/bus/pci/drivers are probed as well (some Fedora-like systems only
# expose the driver in one of these places).
# Returns 0 when the driver is nowhere to be found, 1 when lsmod lists
# it, 2 when a matching sysfs directory exists.
function check_for_driver() {
	local mod=${1//-/_}
	local dir

	if lsmod | grep -q "$mod"; then
		return 1
	fi

	for dir in "/sys/module/$1" "/sys/module/$mod" \
		"/sys/bus/pci/drivers/$1" "/sys/bus/pci/drivers/$mod"; do
		if [[ -d $dir ]]; then
			return 2
		fi
	done
	return 0
}
100
# Verify that the DPDK kernel modules (contigmem.ko, nic_uio.ko) exist in
# at least one of the kernel's module search paths (as reported by
# kldconfig -rU). Returns 1 as soon as one of them is missing.
function check_for_driver_freebsd() {
	# Check if dpdk drivers (nic_uio, contigmem) are in the kernel's module path.
	local search_paths path driver
	IFS=";" read -ra search_paths < <(kldconfig -rU)

	for driver in contigmem.ko nic_uio.ko; do
		for path in "${search_paths[@]}"; do
			# Found this module - move on to the next one.
			[[ -f $path/$driver ]] && continue 2
		done
		return 1
	done
	return 0
}
114
# Print a message prefixed with the device's PCI address and its
# vendor/device ids (from the pci_ids_vendor/pci_ids_device globals,
# "0x" prefix stripped).
function pci_dev_echo() {
	local addr=$1
	shift
	printf '%s (%s %s): %s\n' \
		"$addr" "${pci_ids_vendor["$addr"]#0x}" "${pci_ids_device["$addr"]#0x}" "$*"
}
120
# Bind device $1 to driver $2 through sysfs, detaching it from its current
# driver first (current driver comes from the drivers_d global). Falls
# back to igb_uio when uio_pci_generic did not produce a uio entry
# (broken on some 4.18.x kernels). Finally chowns the device's vfio group
# node to $TARGET_USER when one is set.
function linux_bind_driver() {
	bdf="$1"
	driver_name="$2"
	old_driver_name=${drivers_d["$bdf"]:-no driver}
	ven_dev_id="${pci_ids_vendor["$bdf"]#0x} ${pci_ids_device["$bdf"]#0x}"

	if [[ $driver_name == "$old_driver_name" ]]; then
		pci_dev_echo "$bdf" "Already using the $old_driver_name driver"
		return 0
	fi

	if [[ $old_driver_name != "no driver" ]]; then
		# Detach from the current driver. remove_id may legitimately fail
		# (id never registered) - ignore that.
		echo "$ven_dev_id" > "/sys/bus/pci/devices/$bdf/driver/remove_id" 2> /dev/null || true
		echo "$bdf" > "/sys/bus/pci/devices/$bdf/driver/unbind"
	fi

	pci_dev_echo "$bdf" "$old_driver_name -> $driver_name"

	# Register the vendor:device id with the target driver, then bind.
	# Both writes may fail benignly (e.g. id already known), hence || true.
	echo "$ven_dev_id" > "/sys/bus/pci/drivers/$driver_name/new_id" 2> /dev/null || true
	echo "$bdf" > "/sys/bus/pci/drivers/$driver_name/bind" 2> /dev/null || true

	if [[ $driver_name == uio_pci_generic && -e /sys/module/igb_uio ]]; then
		# Check if the uio_pci_generic driver is broken as it might be in
		# some 4.18.x kernels (see centos8 for instance) - if our device
		# didn't get a proper uio entry, fallback to igb_uio
		if [[ ! -e /sys/bus/pci/devices/$bdf/uio ]]; then
			pci_dev_echo "$bdf" "uio_pci_generic potentially broken, moving to igb_uio"
			drivers_d["$bdf"]="no driver"
			# This call will override $driver_name for remaining devices as well
			linux_bind_driver "$bdf" igb_uio
		fi
	fi

	# Give the target user access to the device's vfio group node, if any.
	iommu_group=$(basename $(readlink -f /sys/bus/pci/devices/$bdf/iommu_group))
	if [ -e "/dev/vfio/$iommu_group" ]; then
		if [ -n "$TARGET_USER" ]; then
			chown "$TARGET_USER" "/dev/vfio/$iommu_group"
		fi
	fi
}
161
# Unbind device $1 from its current driver (looked up in drivers_d) and
# deregister its vendor:device id from that driver. No-op on the sysfs
# side when the device has no driver.
function linux_unbind_driver() {
	local bdf="$1"
	local ven_dev_id
	ven_dev_id="${pci_ids_vendor["$bdf"]#0x} ${pci_ids_device["$bdf"]#0x}"
	local old_driver_name=${drivers_d["$bdf"]:-no driver}

	if [[ -e /sys/bus/pci/drivers/$old_driver_name ]]; then
		# remove_id may fail when the id was never registered - ignore.
		echo "$ven_dev_id" > "/sys/bus/pci/drivers/$old_driver_name/remove_id" 2> /dev/null || true
		echo "$bdf" > "/sys/bus/pci/drivers/$old_driver_name/unbind"
	fi

	pci_dev_echo "$bdf" "$old_driver_name -> no driver"
}
175
# Print the mountpoint of every currently mounted hugetlbfs filesystem,
# one per line (column 3 of mount's "dev on /path type fstype (opts)" output).
function linux_hugetlbfs_mounts() {
	mount | awk '/ type hugetlbfs / { print $3 }'
}
179
# Print the names (e.g. nvme0n1) of all block devices whose resolved
# sysfs device path contains the PCI address given in $1.
function get_block_dev_from_bdf() {
	local bdf=$1
	local block

	for block in /sys/block/*; do
		if [[ $(readlink -f "$block/device") == *"/$bdf/"* ]]; then
			echo "${block##*/}"
		fi
	done
}
190
# For every block device behind PCI address $1, print "<disk>:<dev>" for
# each lsblk entry that has an existing mountpoint. Returns 1 when lsblk
# is not available.
function get_mounted_part_dev_from_bdf_block() {
	local bdf=$1
	local blocks block dev mount

	hash lsblk || return 1
	blocks=($(get_block_dev_from_bdf "$bdf"))

	for block in "${blocks[@]}"; do
		while read -r dev mount; do
			# Second column is empty for unmounted entries; -e then fails.
			if [[ -e $mount ]]; then
				echo "$block:$dev"
			fi
		done < <(lsblk -l -n -o NAME,MOUNTPOINT "/dev/$block")
	done
}
206
# Scan SPDK's pci_ids.h for supported device types (NVMe, IOAT, IDXD,
# VIRTIO, VMD) and populate the global maps: one <type>_d map per type,
# plus all_devices_d and drivers_d. Map values are 1 when the device must
# be skipped (denied/mounted/not-allowed), 0 when it is usable. $1 is the
# script mode; in "status" mode nothing is filtered out.
function collect_devices() {
	# NVMe, IOAT, IDXD, VIRTIO, VMD

	local ids dev_type dev_id bdf bdfs in_use driver

	ids+="PCI_DEVICE_ID_INTEL_IOAT"
	ids+="|PCI_DEVICE_ID_INTEL_IDXD"
	ids+="|PCI_DEVICE_ID_VIRTIO"
	ids+="|PCI_DEVICE_ID_INTEL_VMD"
	ids+="|SPDK_PCI_CLASS_NVME"

	local -gA nvme_d ioat_d idxd_d virtio_d vmd_d all_devices_d drivers_d

	while read -r _ dev_type dev_id; do
		# Intel vendor id by default; NVMe is looked up by class code,
		# virtio by its own (Red Hat) vendor id.
		bdfs=(${pci_bus_cache["0x8086:$dev_id"]})
		[[ $dev_type == *NVME* ]] && bdfs=(${pci_bus_cache["$dev_id"]})
		[[ $dev_type == *VIRT* ]] && bdfs=(${pci_bus_cache["0x1af4:$dev_id"]})
		[[ $dev_type =~ (NVME|IOAT|IDXD|VIRTIO|VMD) ]] && dev_type=${BASH_REMATCH[1],,}
		for bdf in "${bdfs[@]}"; do
			in_use=0
			if [[ $1 != status ]]; then
				if ! pci_can_use "$bdf"; then
					pci_dev_echo "$bdf" "Skipping denied controller at $bdf"
					in_use=1
				fi
				if [[ $dev_type == nvme || $dev_type == virtio ]]; then
					if ! verify_bdf_mounts "$bdf"; then
						in_use=1
					fi
				fi
				if [[ $dev_type == vmd ]]; then
					# VMD controllers are bound only when explicitly allowed.
					if [[ $PCI_ALLOWED != *"$bdf"* ]]; then
						pci_dev_echo "$bdf" "Skipping not allowed VMD controller at $bdf"
						in_use=1
					fi
				fi
			fi
			# Record the device in its per-type map, e.g. nvme_d[$bdf].
			eval "${dev_type}_d[$bdf]=$in_use"
			all_devices_d["$bdf"]=$in_use
			# Remember the driver the device is currently bound to, if any.
			if [[ -e /sys/bus/pci/devices/$bdf/driver ]]; then
				driver=$(readlink -f "/sys/bus/pci/devices/$bdf/driver")
				drivers_d["$bdf"]=${driver##*/}
			fi
		done
	done < <(grep -E "$ids" "$rootdir/include/spdk/pci_ids.h")
}
253
# Print the name of the kernel driver that should own device $1. First
# resolve the device's modalias via modprobe -R; if that fails, fall back
# to a fixed driver name based on which per-type map the device is in.
function collect_driver() {
	local bdf=$1
	local drivers driver

	[[ -e /sys/bus/pci/devices/$bdf/modalias ]] || return 1
	if drivers=($(modprobe -R "$(< "/sys/bus/pci/devices/$bdf/modalias")")); then
		# Pick first entry in case multiple aliases are bound to a driver.
		driver=$(readlink -f "/sys/module/${drivers[0]}/drivers/pci:"*)
		driver=${driver##*/}
	else
		[[ -n ${nvme_d["$bdf"]} ]] && driver=nvme
		[[ -n ${ioat_d["$bdf"]} ]] && driver=ioatdma
		[[ -n ${idxd_d["$bdf"]} ]] && driver=idxd
		[[ -n ${virtio_d["$bdf"]} ]] && driver=virtio-pci
		[[ -n ${vmd_d["$bdf"]} ]] && driver=vmd
	fi 2> /dev/null
	echo "$driver"
}
272
# Check whether any block device behind PCI address $1 has an active
# mountpoint. Returns non-zero (after reporting the offending devices)
# when the device is in use and must not be rebound.
function verify_bdf_mounts() {
	local dev_addr=$1
	local mounted
	mounted=($(get_mounted_part_dev_from_bdf_block "$dev_addr")) || return 1

	((${#mounted[@]} == 0)) && return 0

	local IFS=","
	pci_dev_echo "$dev_addr" "Active mountpoints on ${mounted[*]}, so not binding PCI dev"
	return 1
}
284
# Select a userspace driver (priority: DRIVER_OVERRIDE > vfio-pci >
# uio_pci_generic > locally built igb_uio), load it, and bind every
# collected usable device to it. NVMe devices are rebound in parallel.
function configure_linux_pci() {
	local driver_path=""
	driver_name=""
	igb_uio_fallback=""

	# igb_uio is a common driver to override with and it depends on uio.
	modprobe uio
	if [[ -r "$rootdir/dpdk/build-tmp/kernel/linux/igb_uio/igb_uio.ko" ]]; then
		igb_uio_fallback=$rootdir/dpdk/build-tmp/kernel/linux/igb_uio/igb_uio.ko
		insmod "$igb_uio_fallback" || true
	fi

	if [[ -n "${DRIVER_OVERRIDE}" ]]; then
		driver_path="$DRIVER_OVERRIDE"
		driver_name="${DRIVER_OVERRIDE##*/}"
		# modprobe and the sysfs don't use the .ko suffix.
		driver_name=${driver_name%.ko}
		# path = name -> there is no path
		if [[ "$driver_path" = "$driver_name" ]]; then
			driver_path=""
		fi
	elif [[ -n "$(ls /sys/kernel/iommu_groups)" || (-e \
	/sys/module/vfio/parameters/enable_unsafe_noiommu_mode && \
	"$(cat /sys/module/vfio/parameters/enable_unsafe_noiommu_mode)" == "Y") ]]; then
		# An IOMMU (or vfio's unsafe no-IOMMU mode) is available - prefer vfio-pci.
		driver_name=vfio-pci
	elif modinfo uio_pci_generic > /dev/null 2>&1; then
		driver_name=uio_pci_generic
	elif [[ -e $igb_uio_fallback ]]; then
		driver_name="igb_uio"
		echo "WARNING: uio_pci_generic not detected - using $driver_name"
	else
		echo "No valid drivers found [vfio-pci, uio_pci_generic, igb_uio]. Please either enable the vfio-pci or uio_pci_generic"
		echo "kernel modules, or have SPDK build the igb_uio driver by running ./configure --with-igb-uio-driver and recompiling."
		return 1
	fi

	# modprobe assumes the directory of the module. If the user passes in a path, we should use insmod
	if [[ -n "$driver_path" ]]; then
		insmod $driver_path || true
	else
		modprobe $driver_name
	fi

	for bdf in "${!all_devices_d[@]}"; do
		if ((all_devices_d["$bdf"] == 0)); then
			if [[ -n ${nvme_d["$bdf"]} ]]; then
				# Some nvme controllers may take significant amount of time while being
				# unbound from the driver. Put that task into background to speed up the
				# whole process. Currently this is done only for the devices bound to the
				# nvme driver as other, i.e., ioatdma's, trigger a kernel BUG when being
				# unbound in parallel. See https://bugzilla.kernel.org/show_bug.cgi?id=209041.
				linux_bind_driver "$bdf" "$driver_name" &
			else
				linux_bind_driver "$bdf" "$driver_name"
			fi
		fi
	done
	# Barrier for the backgrounded nvme rebinds above.
	wait

	echo "1" > "/sys/bus/pci/rescan"
}
346
# Remove orphaned SPDK runtime files (dpdk runtime dirs under /var/run,
# /tmp and $XDG_RUNTIME_DIR, plus trace/conn files in /dev/shm) - but
# only those not currently held open by any running process.
function cleanup_linux() {
	shopt -s extglob nullglob
	dirs_to_clean=""
	dirs_to_clean="$(echo {/var/run,/tmp}/dpdk/spdk{,_pid}+([0-9])) "
	# Only consider XDG_RUNTIME_DIR when it exists and has no spaces
	# (the lists below are whitespace-separated).
	if [[ -d $XDG_RUNTIME_DIR && $XDG_RUNTIME_DIR != *" "* ]]; then
		dirs_to_clean+="$(readlink -e assert_not_empty $XDG_RUNTIME_DIR/dpdk/spdk{,_pid}+([0-9]) || true) "
	fi

	files_to_clean=""
	for dir in $dirs_to_clean; do
		files_to_clean+="$(echo $dir/*) "
	done
	shopt -u extglob nullglob

	files_to_clean+="$(ls -1 /dev/shm/* \
		| grep -E '(spdk_tgt|iscsi|vhost|nvmf|rocksdb|bdevio|bdevperf|vhost_fuzz|nvme_fuzz)_trace|spdk_iscsi_conns' || true) "
	# readlink -e drops paths that no longer exist; "assert_not_empty" is a
	# dummy argument that guarantees a non-zero status for an empty list.
	files_to_clean="$(readlink -e assert_not_empty $files_to_clean || true)"
	if [[ -z "$files_to_clean" ]]; then
		echo "Clean"
		return 0
	fi

	# Build the list of files currently opened by any process (via /proc/*/fd).
	shopt -s extglob
	for fd_dir in $(echo /proc/+([0-9])); do
		opened_files+="$(readlink -e assert_not_empty $fd_dir/fd/* || true)"
	done
	shopt -u extglob

	if [[ -z "$opened_files" ]]; then
		echo "Can't get list of opened files!"
		exit 1
	fi

	echo 'Cleaning'
	for f in $files_to_clean; do
		if ! echo "$opened_files" | grep -E -q "^$f\$"; then
			echo "Removing:    $f"
			rm $f
		else
			echo "Still open: $f"
		fi
	done

	for dir in $dirs_to_clean; do
		if ! echo "$opened_files" | grep -E -q "^$dir\$"; then
			echo "Removing:    $dir"
			rmdir $dir
		else
			echo "Still open: $dir"
		fi
	done
	echo "Clean"

	unset dirs_to_clean files_to_clean opened_files
}
402
# Request $NRHUGE hugepages through the procfs/sysfs interface at $1 and
# verify the kernel actually granted that many. Optional $2 is the NUMA
# node number, used only in the error message. Returns 1 on shortfall.
check_hugepages_alloc() {
	local hp_int=$1
	local allocated_hugepages

	# A negative NRHUGE makes no sense - clamp to 0.
	echo $((NRHUGE < 0 ? 0 : NRHUGE)) > "$hp_int"

	# Read back what the kernel actually managed to allocate.
	allocated_hugepages=$(< "$hp_int")
	if ((allocated_hugepages < NRHUGE)); then
		cat <<- ERROR

			## ERROR: requested $NRHUGE hugepages but $allocated_hugepages could be allocated ${2:+on node$2}.
			## Memory might be heavily fragmented. Please try flushing the system cache, or reboot the machine.
		ERROR
		return 1
	fi
}
419
420clear_hugepages() { echo 0 > /proc/sys/vm/nr_hugepages; }
421
# Allocate hugepages according to the HUGEMEM/NRHUGE/HUGENODE/
# HUGE_EVEN_ALLOC/CLEAR_HUGE environment variables. With
# HUGE_EVEN_ALLOC=yes the kernel spreads pages over all NUMA nodes via
# /proc; otherwise pages are allocated per node (node0 by default).
configure_linux_hugepages() {
	local node system_nodes
	local nodes_to_use nodes_hp

	if [[ $CLEAR_HUGE == yes ]]; then
		clear_hugepages
	fi

	if [[ $HUGE_EVEN_ALLOC == yes ]]; then
		clear_hugepages
		check_hugepages_alloc /proc/sys/vm/nr_hugepages
		return 0
	fi

	# Map node id -> sysfs nr_hugepages interface for the selected page size.
	for node in /sys/devices/system/node/node*; do
		[[ -e $node ]] || continue
		nodes[${node##*node}]=$node/hugepages/hugepages-${HUGEPGSZ}kB/nr_hugepages
	done

	IFS="," read -ra nodes_to_use <<< "$HUGENODE"
	if ((${#nodes_to_use[@]} == 0)); then
		nodes_to_use[0]=0
	fi

	# Align indexes with node ids
	for node in "${!nodes_to_use[@]}"; do
		# HUGENODE entries are either "nodes_hp[N]=pages" assignments or
		# plain node ids that get the default NRHUGE.
		if [[ ${nodes_to_use[node]} =~ ^nodes_hp\[[0-9]+\]= ]]; then
			eval "${nodes_to_use[node]}"
		elif [[ ${nodes_to_use[node]} =~ ^[0-9]+$ ]]; then
			nodes_hp[nodes_to_use[node]]=$NRHUGE
		fi
	done

	for node in "${!nodes_hp[@]}"; do
		if [[ -z ${nodes[node]} ]]; then
			echo "Node $node doesn't exist, ignoring" >&2
			continue
		fi
		NRHUGE=${nodes_hp[node]:-$NRHUGE} check_hugepages_alloc "${nodes[node]}" "$node"
	done
}
463
# Full Linux configuration: bind PCI devices, ensure a hugetlbfs mount
# exists, allocate hugepages, and fix up ownership/memlock limits for
# TARGET_USER when vfio-pci is the selected driver.
function configure_linux() {
	configure_linux_pci
	hugetlbfs_mounts=$(linux_hugetlbfs_mounts)

	if [ -z "$hugetlbfs_mounts" ]; then
		hugetlbfs_mounts=/mnt/huge
		echo "Mounting hugetlbfs at $hugetlbfs_mounts"
		mkdir -p "$hugetlbfs_mounts"
		mount -t hugetlbfs nodev "$hugetlbfs_mounts"
	fi

	configure_linux_hugepages

	if [ "$driver_name" = "vfio-pci" ]; then
		if [ -n "$TARGET_USER" ]; then
			# Unprivileged vfio usage needs writable hugepage mounts.
			for mount in $hugetlbfs_mounts; do
				chown "$TARGET_USER" "$mount"
				chmod g+w "$mount"
			done

			# Warn when the user's memlock limit would cap DPDK/VFIO memory.
			MEMLOCK_AMNT=$(su "$TARGET_USER" -c "ulimit -l")
			if [[ $MEMLOCK_AMNT != "unlimited" ]]; then
				MEMLOCK_MB=$((MEMLOCK_AMNT / 1024))
				cat <<- MEMLOCK
					"$TARGET_USER" user memlock limit: $MEMLOCK_MB MB

					This is the maximum amount of memory you will be
					able to use with DPDK and VFIO if run as user "$TARGET_USER".
					To change this, please adjust limits.conf memlock limit for user "$TARGET_USER".
				MEMLOCK
				if ((MEMLOCK_AMNT < 65536)); then
					echo ""
					echo "## WARNING: memlock limit is less than 64MB"
					echo -n "## DPDK with VFIO may not be able to initialize "
					echo "if run as user \"$TARGET_USER\"."
				fi
			fi
		fi
	fi

	if [ ! -f /dev/cpu/0/msr ]; then
		# Some distros build msr as a module.  Make sure it's loaded to ensure
		#  DPDK can easily figure out the TSC rate rather than relying on 100ms
		#  sleeps.
		modprobe msr || true
	fi
}
511
# Rebind every collected usable device back to its preferred kernel
# driver (as resolved by collect_driver), or unbind it entirely when that
# driver is not present on the system.
function reset_linux_pci() {
	# virtio
	# TODO: check if virtio-pci is loaded first and just unbind if it is not loaded
	# Requires some more investigation - for example, some kernels do not seem to have
	#  virtio-pci but just virtio_scsi instead.  Also need to make sure we get the
	#  underscore vs. dash right in the virtio_scsi name.
	modprobe virtio-pci || true
	for bdf in "${!all_devices_d[@]}"; do
		((all_devices_d["$bdf"] == 0)) || continue

		driver=$(collect_driver "$bdf")
		# check_for_driver returns non-zero when the driver IS present.
		if ! check_for_driver "$driver"; then
			linux_bind_driver "$bdf" "$driver"
		else
			linux_unbind_driver "$bdf"
		fi
	done

	echo "1" > "/sys/bus/pci/rescan"
}
532
# Full reset: rebind PCI devices, then remove leftover SPDK hugepage map
# files (only those no process holds locked) and SPDK runtime files.
function reset_linux() {
	reset_linux_pci
	for mount in $(linux_hugetlbfs_mounts); do
		for hp in "$mount"/spdk*map_*; do
			# flock -n succeeds only when no process holds the lock - safe to remove.
			flock -n "$hp" true && rm -f "$hp"
		done
	done
	rm -f /run/.spdk*
}
542
# Print hugepage usage (per NUMA node when available, system-wide
# otherwise) and a table of all SPDK-compatible devices with their type,
# ids, NUMA node, driver and block devices. Headers go to stderr so the
# table body stays machine-parseable on stdout.
function status_linux() {
	echo "Hugepages" >&2
	printf "%-6s %10s %8s / %6s\n" "node" "hugesize" "free" "total" >&2

	numa_nodes=0
	shopt -s nullglob
	for path in /sys/devices/system/node/node*/hugepages/hugepages-*/; do
		numa_nodes=$((numa_nodes + 1))
		free_pages=$(cat $path/free_hugepages)
		all_pages=$(cat $path/nr_hugepages)

		# Extract node id and page size from the sysfs path.
		[[ $path =~ (node[0-9]+)/hugepages/hugepages-([0-9]+kB) ]]

		node=${BASH_REMATCH[1]}
		huge_size=${BASH_REMATCH[2]}

		printf "%-6s %10s %8s / %6s\n" $node $huge_size $free_pages $all_pages
	done
	shopt -u nullglob

	# fall back to system-wide hugepages
	if [ "$numa_nodes" = "0" ]; then
		free_pages=$(grep HugePages_Free /proc/meminfo | awk '{ print $2 }')
		all_pages=$(grep HugePages_Total /proc/meminfo | awk '{ print $2 }')
		node="-"
		huge_size="$HUGEPGSZ"

		printf "%-6s %10s %8s / %6s\n" $node $huge_size $free_pages $all_pages
	fi

	printf '\n%-8s %-15s %-6s %-6s %-7s %-16s %-10s %s\n' \
		"Type" "BDF" "Vendor" "Device" "NUMA" "Driver" "Device" "Block devices" >&2

	for bdf in "${!all_devices_d[@]}"; do
		driver=${drivers_d["$bdf"]}
		if [ "$numa_nodes" = "0" ]; then
			node="-"
		else
			node=$(cat /sys/bus/pci/devices/$bdf/numa_node)
			if ((node == -1)); then
				node=unknown
			fi
		fi
		if [ "$driver" = "nvme" ] && [ -d /sys/bus/pci/devices/$bdf/nvme ]; then
			name=$(ls /sys/bus/pci/devices/$bdf/nvme)
		else
			name="-"
		fi

		# Only NVMe/virtio devices expose block devices.
		if [[ -n ${nvme_d["$bdf"]} || -n ${virtio_d["$bdf"]} ]]; then
			blknames=($(get_block_dev_from_bdf "$bdf"))
		else
			blknames=("-")
		fi

		# First matching per-type map wins; each bdf is in exactly one.
		desc=""
		desc=${desc:-${nvme_d["$bdf"]:+NVMe}}
		desc=${desc:-${ioat_d["$bdf"]:+I/OAT}}
		desc=${desc:-${idxd_d["$bdf"]:+IDXD}}
		desc=${desc:-${virtio_d["$bdf"]:+virtio}}
		desc=${desc:-${vmd_d["$bdf"]:+VMD}}

		printf '%-8s %-15s %-6s %-6s %-7s %-16s %-10s %s\n' \
			"$desc" "$bdf" "${pci_ids_vendor["$bdf"]#0x}" "${pci_ids_device["$bdf"]#0x}" \
			"$node" "${driver:--}" "${name:-}" "${blknames[*]:--}"
	done | sort -bk2,2
}
610
# FreeBSD status: print contigmem state/tunables and a per-type table of
# collected devices with their current drivers (via pciconf).
function status_freebsd() {
	local pci

	# Helper (runs in a subshell) printing one device table for the
	# addresses passed as arguments.
	status_print() (
		local dev driver

		echo -e "BDF\t\tVendor\tDevice\tDriver"

		for pci; do
			driver=$(pciconf -l "pci$pci")
			# Keep only the driver name from "driver0@pci..." output.
			driver=${driver%@*}
			printf '%s\t%s\t%s\t%s\n' \
				"$pci" \
				"${pci_ids_vendor["$pci"]}" \
				"${pci_ids_device["$pci"]}" \
				"$driver"
		done | sort -k1,1
	)

	local contigmem=present
	local contigmem_buffer_size
	local contigmem_num_buffers

	if ! kldstat -q -m contigmem; then
		contigmem="not present"
	fi
	if ! contigmem_buffer_size=$(kenv hw.contigmem.buffer_size 2> /dev/null); then
		contigmem_buffer_size="not set"
	fi
	if ! contigmem_num_buffers=$(kenv hw.contigmem.num_buffers 2> /dev/null); then
		contigmem_num_buffers="not set"
	fi

	cat <<- BSD_INFO
		Contigmem ($contigmem)
		Buffer Size: $contigmem_buffer_size
		Num Buffers: $contigmem_num_buffers

		NVMe devices
		$(status_print "${!nvme_d[@]}")

		I/IOAT DMA
		$(status_print "${!ioat_d[@]}")

		IDXD DMA
		$(status_print "${!idxd_d[@]}")

		VMD
		$(status_print "${!vmd_d[@]}")
	BSD_INFO
}
662
# Hand all collected device addresses to the nic_uio driver through the
# kernel environment and (re)load it.
function configure_freebsd_pci() {
	local BDFS

	BDFS+=("${!nvme_d[@]}")
	BDFS+=("${!ioat_d[@]}")
	BDFS+=("${!idxd_d[@]}")
	BDFS+=("${!vmd_d[@]}")

	# Drop the domain part from all the addresses
	BDFS=("${BDFS[@]#*:}")

	# nic_uio expects a comma-separated list; IFS drives the [*] join below.
	local IFS=","
	kldunload nic_uio.ko || true
	kenv hw.nic_uio.bdfs="${BDFS[*]}"
	kldload nic_uio.ko
}
679
# FreeBSD configuration: bind devices to nic_uio and (re)load contigmem
# with enough 256MB buffers to satisfy HUGEMEM (in MB).
function configure_freebsd() {
	if ! check_for_driver_freebsd; then
		echo "DPDK drivers (contigmem and/or nic_uio) are missing, aborting" >&2
		return 1
	fi
	configure_freebsd_pci
	# If contigmem is already loaded but the HUGEMEM specified doesn't match the
	#  previous value, unload contigmem so that we can reload with the new value.
	if kldstat -q -m contigmem; then
		# contigmem may be loaded, but the kernel environment doesn't have to
		# be necessarily set at this point. If it isn't, kenv will fail to
		# pick up the hw. options. Handle it.
		if ! contigmem_num_buffers=$(kenv hw.contigmem.num_buffers); then
			contigmem_num_buffers=-1
		fi 2> /dev/null
		if ((contigmem_num_buffers != HUGEMEM / 256)); then
			kldunload contigmem.ko
		fi
	fi
	if ! kldstat -q -m contigmem; then
		kenv hw.contigmem.num_buffers=$((HUGEMEM / 256))
		kenv hw.contigmem.buffer_size=$((256 * 1024 * 1024))
		kldload contigmem.ko
	fi
}
705
# Unload the DPDK kernel modules; ignore errors when they are not loaded.
function reset_freebsd() {
	kldunload contigmem.ko || true
	kldunload nic_uio.ko || true
}
710
# --- main ---

# Cache the PCI bus up front so device lookups below don't rescan it.
CMD=reset cache_pci_bus

mode=$1

if [ -z "$mode" ]; then
	mode="config"
fi

: ${HUGEMEM:=2048}
: ${PCI_ALLOWED:=""}
: ${PCI_BLOCKED:=""}

# NVME_ALLOWED is honored as an additional source of allowed addresses.
if [ -n "$NVME_ALLOWED" ]; then
	PCI_ALLOWED="$PCI_ALLOWED $NVME_ALLOWED"
fi

# SKIP_PCI blocks everything by using a non-matching "address".
if [ -n "$SKIP_PCI" ]; then
	PCI_ALLOWED="none"
fi

# Default to the invoking (sudo) user for ownership of vfio/hugepage resources.
if [ -z "$TARGET_USER" ]; then
	TARGET_USER="$SUDO_USER"
	if [ -z "$TARGET_USER" ]; then
		TARGET_USER=$(logname 2> /dev/null) || true
	fi
fi

collect_devices "$mode"

if [[ $mode == reset && $PCI_BLOCK_SYNC_ON_RESET == yes ]]; then
	# Note that this will wait only for the first block device attached to
	# a given storage controller. For nvme this may miss some of the devs
	# in case multiple namespaces are being in place.
	# FIXME: Wait for nvme controller(s) to be in live state and determine
	# number of configured namespaces, build list of potential block devs
	# and pass them to sync_dev_uevents. Is it worth the effort?
	bdfs_to_wait_for=()
	for bdf in "${!all_devices_d[@]}"; do
		((all_devices_d["$bdf"] == 0)) || continue
		if [[ -n ${nvme_d["$bdf"]} || -n ${virtio_d["$bdf"]} ]]; then
			# Only wait for devices whose driver will actually change.
			[[ $(collect_driver "$bdf") != "${drivers_d["$bdf"]}" ]] || continue
			bdfs_to_wait_for+=("$bdf")
		fi
	done
	if ((${#bdfs_to_wait_for[@]} > 0)); then
		echo "Waiting for block devices as requested"
		export UEVENT_TIMEOUT=5 DEVPATH_LOOKUP=yes DEVPATH_SUBSYSTEM=pci
		"$rootdir/scripts/sync_dev_uevents.sh" \
			block/disk \
			"${bdfs_to_wait_for[@]}" &
		sync_pid=$!
	fi
fi

if [[ $os == Linux ]]; then
	# Ignore an unsupported HUGEPGSZ rather than failing later writes.
	if [[ -n $HUGEPGSZ && ! -e /sys/kernel/mm/hugepages/hugepages-${HUGEPGSZ}kB ]]; then
		echo "${HUGEPGSZ}kB is not supported by the running kernel, ignoring" >&2
		unset -v HUGEPGSZ
	fi

	# Fall back to the kernel's default hugepage size and derive the number
	# of pages needed to cover HUGEMEM (in MB), rounding up.
	HUGEPGSZ=${HUGEPGSZ:-$(grep Hugepagesize /proc/meminfo | cut -d : -f 2 | tr -dc '0-9')}
	HUGEPGSZ_MB=$((HUGEPGSZ / 1024))
	: ${NRHUGE=$(((HUGEMEM + HUGEPGSZ_MB - 1) / HUGEPGSZ_MB))}

	if [ "$mode" == "config" ]; then
		configure_linux
	elif [ "$mode" == "cleanup" ]; then
		cleanup_linux
	elif [ "$mode" == "reset" ]; then
		reset_linux
	elif [ "$mode" == "status" ]; then
		status_linux
	elif [ "$mode" == "help" ]; then
		usage $0
	else
		usage $0 "Invalid argument '$mode'"
	fi
else
	if [ "$mode" == "config" ]; then
		configure_freebsd
	elif [ "$mode" == "reset" ]; then
		reset_freebsd
	elif [ "$mode" == "cleanup" ]; then
		echo "setup.sh cleanup function not yet supported on $os"
	elif [ "$mode" == "status" ]; then
		status_freebsd
	elif [ "$mode" == "help" ]; then
		usage $0
	else
		usage $0 "Invalid argument '$mode'"
	fi
fi

# Reap the background uevent-sync helper, if one was started.
if [[ -e /proc/$sync_pid/status ]]; then
	wait "$sync_pid"
fi
807