xref: /spdk/scripts/setup.sh (revision b30d57cdad6d2bc75cc1e4e2ebbcebcb0d98dcfa)
#!/usr/bin/env bash

# Abort immediately if any command fails.
set -e

os=$(uname -s)

# Only Linux and FreeBSD are supported; bail out early on anything else.
if [[ $os != Linux && $os != FreeBSD ]]; then
	echo "Not supported platform ($os), aborting"
	exit 1
fi

# Resolve the SPDK repository root relative to this script's location.
rootdir=$(readlink -f $(dirname $0))/..
source "$rootdir/scripts/common.sh"
14
# Print the help text and exit 0.
# $1 - path to this script (used in the "Usage:" line)
# $2 - optional error message printed ahead of the help text
function usage() {
	if [[ $os == Linux ]]; then
		options="[config|reset|status|cleanup|help]"
	else
		options="[config|reset|help]"
	fi

	[[ -n $2 ]] && (
		echo "$2"
		echo ""
	)
	echo "Helper script for allocating hugepages and binding NVMe, I/OAT, VMD and Virtio devices"
	echo "to a generic VFIO kernel driver. If VFIO is not available on the system, this script"
	echo "will fall back to UIO. NVMe and Virtio devices with active mountpoints will be ignored."
	echo "All hugepage operations use default hugepage size on the system (hugepagesz)."
	echo "Usage: $(basename "$1") $options"
	echo
	echo "$options - as following:"
	echo "config            Default mode. Allocate hugepages and bind PCI devices."
	if [[ $os == Linux ]]; then
		echo "cleanup           Remove any orphaned files that can be left in the system after SPDK application exit"
	fi
	echo "reset             Rebind PCI devices back to their original drivers."
	echo "                  Also cleanup any leftover spdk files/resources."
	echo "                  Hugepage memory size will remain unchanged."
	if [[ $os == Linux ]]; then
		echo "status            Print status of all SPDK-compatible devices on the system."
	fi
	echo "help              Print this help message."
	echo
	echo "The following environment variables can be specified."
	echo "HUGEMEM           Size of hugepage memory to allocate (in MB). 2048 by default."
	echo "                  For NUMA systems, the hugepages will be distributed on node0 by"
	echo "                  default."
	echo "HUGE_EVEN_ALLOC   If set to 'yes', hugepages will be evenly distributed across all"
	echo "                  system's NUMA nodes (effectively ignoring anything set in HUGENODE)."
	echo "                  Uses kernel's default for hugepages size."
	echo "NRHUGE            Number of hugepages to allocate. This variable overwrites HUGEMEM."
	echo "HUGENODE          Specific NUMA node to allocate hugepages on. Multiple nodes can be"
	echo "                  separated with commas. By default, NRHUGE will be applied on each node."
	echo "                  Hugepages can be defined per node with e.g.:"
	echo "                  HUGENODE='nodes_hp[0]=2048,nodes_hp[1]=512,2' - this will allocate"
	echo "                  2048 pages for node0, 512 for node1 and default NRHUGE for node2."
	echo "HUGEPGSZ          Size of the hugepages to use in kB. If not set, kernel's default"
	echo "                  setting is used."
	echo "CLEAR_HUGE        If set to 'yes', the attempt to remove hugepages from all nodes will"
	echo "                  be made prior to allocation."
	echo "PCI_ALLOWED"
	echo "PCI_BLOCKED       Whitespace separated list of PCI devices (NVMe, I/OAT, VMD, Virtio)."
	echo "                  Each device must be specified as a full PCI address."
	echo "                  E.g. PCI_ALLOWED=\"0000:01:00.0 0000:02:00.0\""
	echo "                  To block all PCI devices use a non-valid address."
	echo "                  E.g. PCI_BLOCKED=\"none\""
	echo "                  If PCI_ALLOWED and PCI_BLOCKED are empty or unset, all PCI devices"
	echo "                  will be bound."
	echo "                  Each device in PCI_BLOCKED will be ignored (driver won't be changed)."
	echo "                  PCI_BLOCKED has precedence over PCI_ALLOWED."
	echo "TARGET_USER       User that will own hugepage mountpoint directory and vfio groups."
	echo "                  By default the current user will be used."
	echo "DRIVER_OVERRIDE   Disable automatic vfio-pci/uio_pci_generic selection and forcefully"
	echo "                  bind devices to the given driver."
	echo "                  E.g. DRIVER_OVERRIDE=uio_pci_generic or DRIVER_OVERRIDE=/home/public/dpdk/build/kmod/igb_uio.ko"
	echo "PCI_BLOCK_SYNC_ON_RESET"
	echo "                  If set in the environment, the attempt to wait for block devices associated"
	echo "                  with given PCI device will be made upon reset"
	exit 0
}
82
# In monolithic kernels the lsmod won't work. So
# back that with a /sys/modules. We also check
# /sys/bus/pci/drivers/ as neither lsmod nor /sys/modules might
# contain needed info (like in Fedora-like OS).
# $1 - driver name (dashes are normalized to underscores)
# Returns 1 when the module is loaded, 2 when a matching /sys/module or
# /sys/bus/pci/drivers entry exists, 0 when the driver is not present.
function check_for_driver() {
	# Quote the pattern so an empty or whitespace-containing argument
	# cannot break the grep invocation.
	if lsmod | grep -q "${1//-/_}"; then
		return 1
	fi

	if [[ -d /sys/module/${1} || -d \
		/sys/module/${1//-/_} || -d \
		/sys/bus/pci/drivers/${1} || -d \
		/sys/bus/pci/drivers/${1//-/_} ]]; then
		return 2
	fi
	return 0
}
100
function check_for_driver_freebsd() {
	# Check if dpdk drivers (nic_uio, contigmem) are in the kernel's module path.
	# Search paths come from kldconfig as a ';'-separated list.
	local -a module_dirs=()
	local module dir found

	IFS=";" read -ra module_dirs < <(kldconfig -rU)

	for module in contigmem.ko nic_uio.ko; do
		found=no
		for dir in "${module_dirs[@]}"; do
			if [[ -f $dir/$module ]]; then
				found=yes
				break
			fi
		done
		# Every module must be found in at least one search path.
		[[ $found == yes ]] || return 1
	done
	return 0
}
114
# Print a message prefixed with the device's BDF and its vendor/device ids
# (looked up in the global pci_ids_vendor/pci_ids_device caches, "0x" stripped).
# $1 - device BDF, remaining args - message text
function pci_dev_echo() {
	local bdf=$1
	shift
	printf '%s (%s %s): %s\n' \
		"$bdf" "${pci_ids_vendor["$bdf"]#0x}" "${pci_ids_device["$bdf"]#0x}" "$*"
}
120
# Bind a PCI device to the requested driver, unbinding it from its current
# one first.  Falls back to igb_uio when uio_pci_generic didn't create a uio
# entry for the device, and hands vfio group nodes over to TARGET_USER.
# $1 - device BDF
# $2 - target driver name
function linux_bind_driver() {
	bdf="$1"
	driver_name="$2"
	old_driver_name=${drivers_d["$bdf"]:-no driver}
	ven_dev_id="${pci_ids_vendor["$bdf"]#0x} ${pci_ids_device["$bdf"]#0x}"

	if [[ $driver_name == "$old_driver_name" ]]; then
		pci_dev_echo "$bdf" "Already using the $old_driver_name driver"
		return 0
	fi

	# Detach from the current driver and drop its dynamic id first.
	if [[ $old_driver_name != "no driver" ]]; then
		echo "$ven_dev_id" > "/sys/bus/pci/devices/$bdf/driver/remove_id" 2> /dev/null || true
		echo "$bdf" > "/sys/bus/pci/devices/$bdf/driver/unbind"
	fi

	pci_dev_echo "$bdf" "$old_driver_name -> $driver_name"

	# new_id/bind may legitimately fail (e.g. id already registered) - ignore.
	echo "$ven_dev_id" > "/sys/bus/pci/drivers/$driver_name/new_id" 2> /dev/null || true
	echo "$bdf" > "/sys/bus/pci/drivers/$driver_name/bind" 2> /dev/null || true

	if [[ $driver_name == uio_pci_generic && -e /sys/module/igb_uio ]]; then
		# Check if the uio_pci_generic driver is broken as it might be in
		# some 4.18.x kernels (see centos8 for instance) - if our device
		# didn't get a proper uio entry, fallback to igb_uio
		if [[ ! -e /sys/bus/pci/devices/$bdf/uio ]]; then
			pci_dev_echo "$bdf" "uio_pci_generic potentially broken, moving to igb_uio"
			drivers_d["$bdf"]="no driver"
			# This call will override $driver_name for remaining devices as well
			linux_bind_driver "$bdf" igb_uio
		fi
	fi

	# If the device landed in a vfio group, let TARGET_USER own the group node.
	iommu_group=$(basename $(readlink -f /sys/bus/pci/devices/$bdf/iommu_group))
	if [ -e "/dev/vfio/$iommu_group" ]; then
		if [ -n "$TARGET_USER" ]; then
			chown "$TARGET_USER" "/dev/vfio/$iommu_group"
		fi
	fi
}
161
# Detach a PCI device from its current kernel driver (if it has one) and
# remove the dynamic id so the driver won't immediately rebind it.
# $1 - device BDF
function linux_unbind_driver() {
	local bdf=$1
	local ven_dev_id
	ven_dev_id="${pci_ids_vendor["$bdf"]#0x} ${pci_ids_device["$bdf"]#0x}"
	local current_driver=${drivers_d["$bdf"]:-no driver}

	if [[ -e /sys/bus/pci/drivers/$current_driver ]]; then
		# remove_id may fail if the id was never registered - ignore that.
		echo "$ven_dev_id" > "/sys/bus/pci/drivers/$current_driver/remove_id" 2> /dev/null || true
		echo "$bdf" > "/sys/bus/pci/drivers/$current_driver/unbind"
	fi

	pci_dev_echo "$bdf" "$current_driver -> no driver"
}
175
# Print the mountpoint (third field of mount output) of every hugetlbfs mount.
function linux_hugetlbfs_mounts() {
	mount | awk '/ type hugetlbfs / { print $3 }'
}
179
# Print the names of all block devices under /sys/block whose device link
# resolves through the given PCI address.
# $1 - device BDF
function get_block_dev_from_bdf() {
	local target=$1
	local dev_path

	for dev_path in /sys/block/*; do
		# Keep the readlink inside the test so a failure doesn't trip set -e.
		[[ $(readlink -f "$dev_path/device") == *"/$target/"* ]] || continue
		echo "${dev_path##*/}"
	done
}
190
# For each block device behind the given PCI address, print "block:partition"
# for every partition that currently has an existing mountpoint.
# $1 - device BDF
# Returns 1 when lsblk is not available.
function get_mounted_part_dev_from_bdf_block() {
	local bdf=$1
	local candidates candidate name mountpoint

	hash lsblk || return 1
	candidates=($(get_block_dev_from_bdf "$bdf"))

	for candidate in "${candidates[@]}"; do
		while read -r name mountpoint; do
			if [[ -e $mountpoint ]]; then
				echo "$candidate:$name"
			fi
		done < <(lsblk -l -n -o NAME,MOUNTPOINT "/dev/$candidate")
	done
}
206
# Discover all SPDK-compatible PCI devices by grepping the supported ids out
# of include/spdk/pci_ids.h and matching them against pci_bus_cache.  Fills
# the global maps <type>_d and all_devices_d (bdf -> 0/1, 1 meaning "leave
# this device alone") and drivers_d (bdf -> currently bound kernel driver).
# $1 - mode; in "status" mode no allow/block/mountpoint filtering is applied.
function collect_devices() {
	# NVMe, IOAT, IDXD, VIRTIO, VMD

	local ids dev_type dev_id bdf bdfs in_use driver

	ids+="PCI_DEVICE_ID_INTEL_IOAT"
	ids+="|PCI_DEVICE_ID_INTEL_IDXD"
	ids+="|PCI_DEVICE_ID_VIRTIO"
	ids+="|PCI_DEVICE_ID_INTEL_VMD"
	ids+="|SPDK_PCI_CLASS_NVME"

	local -gA nvme_d ioat_d idxd_d virtio_d vmd_d all_devices_d drivers_d

	while read -r _ dev_type dev_id; do
		# Intel vendor id by default; NVMe is matched by class code alone,
		# virtio devices by the 0x1af4 vendor id.
		bdfs=(${pci_bus_cache["0x8086:$dev_id"]})
		[[ $dev_type == *NVME* ]] && bdfs=(${pci_bus_cache["$dev_id"]})
		[[ $dev_type == *VIRT* ]] && bdfs=(${pci_bus_cache["0x1af4:$dev_id"]})
		[[ $dev_type =~ (NVME|IOAT|IDXD|VIRTIO|VMD) ]] && dev_type=${BASH_REMATCH[1],,}
		for bdf in "${bdfs[@]}"; do
			in_use=0
			if [[ $1 != status ]]; then
				if ! pci_can_use "$bdf"; then
					pci_dev_echo "$bdf" "Skipping denied controller at $bdf"
					in_use=1
				fi
				# Never touch NVMe/virtio devices with mounted filesystems.
				if [[ $dev_type == nvme || $dev_type == virtio ]]; then
					if ! verify_bdf_mounts "$bdf"; then
						in_use=1
					fi
				fi
				# VMD controllers are opt-in: they must be explicitly allowed.
				if [[ $dev_type == vmd ]]; then
					if [[ $PCI_ALLOWED != *"$bdf"* ]]; then
						pci_dev_echo "$bdf" "Skipping not allowed VMD controller at $bdf"
						in_use=1
					fi
				fi
			fi
			# eval expands e.g. nvme_d[<bdf>]=0 for the per-type map.
			eval "${dev_type}_d[$bdf]=$in_use"
			all_devices_d["$bdf"]=$in_use
			if [[ -e /sys/bus/pci/devices/$bdf/driver ]]; then
				driver=$(readlink -f "/sys/bus/pci/devices/$bdf/driver")
				drivers_d["$bdf"]=${driver##*/}
			fi
		done
	done < <(grep -E "$ids" "$rootdir/include/spdk/pci_ids.h")
}
253
# Resolve which kernel driver should own the given device on reset.  Prefers
# the driver modprobe resolves from the device's modalias; falls back to a
# hardcoded per-type default when that fails.  Prints the driver name.
# $1 - device BDF
function collect_driver() {
	local bdf=$1
	local drivers driver

	[[ -e /sys/bus/pci/devices/$bdf/modalias ]] || return 1
	if drivers=($(modprobe -R "$(< "/sys/bus/pci/devices/$bdf/modalias")")); then
		# Pick first entry in case multiple aliases are bound to a driver.
		driver=$(readlink -f "/sys/module/${drivers[0]}/drivers/pci:"*)
		driver=${driver##*/}
	else
		[[ -n ${nvme_d["$bdf"]} ]] && driver=nvme
		[[ -n ${ioat_d["$bdf"]} ]] && driver=ioatdma
		[[ -n ${idxd_d["$bdf"]} ]] && driver=idxd
		[[ -n ${virtio_d["$bdf"]} ]] && driver=virtio-pci
		[[ -n ${vmd_d["$bdf"]} ]] && driver=vmd
	fi 2> /dev/null # note: redirection on "fi" silences stderr of the whole if
	echo "$driver"
}
272
# Succeed when no partition behind the given PCI device is mounted; print a
# notice and return 1 otherwise (or when the mount lookup itself fails).
# $1 - device BDF
function verify_bdf_mounts() {
	local bdf=$1
	local mounted_blocks
	mounted_blocks=($(get_mounted_part_dev_from_bdf_block "$bdf")) || return 1

	((${#mounted_blocks[@]} == 0)) && return 0

	# Join the block:partition entries with commas for the message.
	local IFS=","
	pci_dev_echo "$bdf" "Active mountpoints on ${mounted_blocks[*]}, so not binding PCI dev"
	return 1
}
284
# Select a userspace PCI driver (priority: DRIVER_OVERRIDE, then vfio-pci
# when an IOMMU is active, then uio_pci_generic, then the igb_uio fallback),
# load it, and bind every collected device that wasn't marked in-use.
function configure_linux_pci() {
	local driver_path=""
	driver_name=""
	igb_uio_fallback=""

	# igb_uio is a common driver to override with and it depends on uio.
	modprobe uio
	if [[ -r "$rootdir/dpdk/build-tmp/kernel/linux/igb_uio/igb_uio.ko" ]]; then
		igb_uio_fallback=$rootdir/dpdk/build-tmp/kernel/linux/igb_uio/igb_uio.ko
		insmod "$igb_uio_fallback" || true
	fi

	if [[ -n "${DRIVER_OVERRIDE}" ]]; then
		driver_path="$DRIVER_OVERRIDE"
		driver_name="${DRIVER_OVERRIDE##*/}"
		# modprobe and the sysfs don't use the .ko suffix.
		driver_name=${driver_name%.ko}
		# path = name -> there is no path
		if [[ "$driver_path" = "$driver_name" ]]; then
			driver_path=""
		fi
	elif [[ -n "$(ls /sys/kernel/iommu_groups)" || (-e \
	/sys/module/vfio/parameters/enable_unsafe_noiommu_mode && \
	"$(cat /sys/module/vfio/parameters/enable_unsafe_noiommu_mode)" == "Y") ]]; then
		# IOMMU groups exist (or unsafe noiommu mode is enabled) -> vfio-pci.
		driver_name=vfio-pci
		# Just in case, attempt to load VFIO_IOMMU_TYPE1 module into the kernel - this
		# should be done automatically by modprobe since this particular module should
		# be a part of vfio-pci dependencies, however, on some distros, it seems that
		# it's not the case. See #1689.
		if modinfo vfio_iommu_type1 > /dev/null; then
			modprobe vfio_iommu_type1
		fi
	elif modinfo uio_pci_generic > /dev/null 2>&1; then
		driver_name=uio_pci_generic
	elif [[ -e $igb_uio_fallback ]]; then
		driver_name="igb_uio"
		echo "WARNING: uio_pci_generic not detected - using $driver_name"
	else
		echo "No valid drivers found [vfio-pci, uio_pci_generic, igb_uio]. Please either enable the vfio-pci or uio_pci_generic"
		echo "kernel modules, or have SPDK build the igb_uio driver by running ./configure --with-igb-uio-driver and recompiling."
		return 1
	fi

	# modprobe assumes the directory of the module. If the user passes in a path, we should use insmod
	if [[ -n "$driver_path" ]]; then
		insmod $driver_path || true
	else
		modprobe $driver_name
	fi

	for bdf in "${!all_devices_d[@]}"; do
		if ((all_devices_d["$bdf"] == 0)); then
			if [[ -n ${nvme_d["$bdf"]} ]]; then
				# Some nvme controllers may take significant amount of time while being
				# unbound from the driver. Put that task into background to speed up the
				# whole process. Currently this is done only for the devices bound to the
				# nvme driver as other, i.e., ioatdma's, trigger a kernel BUG when being
				# unbound in parallel. See https://bugzilla.kernel.org/show_bug.cgi?id=209041.
				linux_bind_driver "$bdf" "$driver_name" &
			else
				linux_bind_driver "$bdf" "$driver_name"
			fi
		fi
	done
	# Reap the backgrounded nvme unbind/bind jobs before rescanning.
	wait

	echo "1" > "/sys/bus/pci/rescan"
}
353
# Remove orphaned SPDK/DPDK runtime files and directories (dpdk runtime dirs
# under /var/run, /tmp and XDG_RUNTIME_DIR, plus matching /dev/shm entries)
# that are not held open by any running process.  Prints "Clean" when done.
function cleanup_linux() {
	shopt -s extglob nullglob
	dirs_to_clean=""
	dirs_to_clean="$(echo {/var/run,/tmp}/dpdk/spdk{,_pid}+([0-9])) "
	if [[ -d $XDG_RUNTIME_DIR && $XDG_RUNTIME_DIR != *" "* ]]; then
		# "assert_not_empty" never resolves: it forces readlink -e to fail
		# (caught by || true) when the glob expands to nothing.
		dirs_to_clean+="$(readlink -e assert_not_empty $XDG_RUNTIME_DIR/dpdk/spdk{,_pid}+([0-9]) || true) "
	fi

	files_to_clean=""
	for dir in $dirs_to_clean; do
		files_to_clean+="$(echo $dir/*) "
	done
	shopt -u extglob nullglob

	files_to_clean+="$(ls -1 /dev/shm/* \
		| grep -E '(spdk_tgt|iscsi|vhost|nvmf|rocksdb|bdevio|bdevperf|vhost_fuzz|nvme_fuzz)_trace|spdk_iscsi_conns' || true) "
	files_to_clean="$(readlink -e assert_not_empty $files_to_clean || true)"
	if [[ -z "$files_to_clean" ]]; then
		echo "Clean"
		return 0
	fi

	# Gather every file currently opened by any process via /proc/*/fd.
	shopt -s extglob
	for fd_dir in $(echo /proc/+([0-9])); do
		opened_files+="$(readlink -e assert_not_empty $fd_dir/fd/* || true)"
	done
	shopt -u extglob

	if [[ -z "$opened_files" ]]; then
		echo "Can't get list of opened files!"
		exit 1
	fi

	echo 'Cleaning'
	for f in $files_to_clean; do
		# Only remove files no process has open.
		if ! echo "$opened_files" | grep -E -q "^$f\$"; then
			echo "Removing:    $f"
			rm $f
		else
			echo "Still open: $f"
		fi
	done

	for dir in $dirs_to_clean; do
		if ! echo "$opened_files" | grep -E -q "^$dir\$"; then
			echo "Removing:    $dir"
			rmdir $dir
		else
			echo "Still open: $dir"
		fi
	done
	echo "Clean"

	unset dirs_to_clean files_to_clean opened_files
}
409
# Request NRHUGE hugepages through the given nr_hugepages interface file and
# verify the kernel actually granted them.
# $1 - path to a nr_hugepages sysctl/sysfs file
# $2 - optional node id, used only in the error message
# Returns 1 (with an error message) when fewer pages were allocated.
check_hugepages_alloc() {
	local hp_sysctl=$1
	local granted

	# Never write a negative count; clamp the request at zero.
	echo $((NRHUGE < 0 ? 0 : NRHUGE)) > "$hp_sysctl"

	granted=$(< "$hp_sysctl")
	if ((granted < NRHUGE)); then
		echo ""
		echo "## ERROR: requested $NRHUGE hugepages but $granted could be allocated ${2:+on node$2}."
		echo "## Memory might be heavily fragmented. Please try flushing the system cache, or reboot the machine."
		return 1
	fi
}
426
427clear_hugepages() { echo 0 > /proc/sys/vm/nr_hugepages; }
428
# Allocate hugepages according to CLEAR_HUGE, HUGE_EVEN_ALLOC, HUGENODE and
# NRHUGE.  By default NRHUGE pages go to node0; HUGENODE may list node ids
# (comma-separated) or explicit "nodes_hp[N]=count" assignments.
configure_linux_hugepages() {
	local node system_nodes
	local nodes_to_use nodes_hp

	if [[ $CLEAR_HUGE == yes ]]; then
		clear_hugepages
	fi

	if [[ $HUGE_EVEN_ALLOC == yes ]]; then
		# System-wide allocation; the kernel spreads pages over the nodes.
		clear_hugepages
		check_hugepages_alloc /proc/sys/vm/nr_hugepages
		return 0
	fi

	# Map node id -> per-node nr_hugepages file for the selected page size.
	for node in /sys/devices/system/node/node*; do
		[[ -e $node ]] || continue
		nodes[${node##*node}]=$node/hugepages/hugepages-${HUGEPGSZ}kB/nr_hugepages
	done

	IFS="," read -ra nodes_to_use <<< "$HUGENODE"
	if ((${#nodes_to_use[@]} == 0)); then
		nodes_to_use[0]=0
	fi

	# Align indexes with node ids
	for node in "${!nodes_to_use[@]}"; do
		if [[ ${nodes_to_use[node]} =~ ^nodes_hp\[[0-9]+\]= ]]; then
			# Explicit "nodes_hp[N]=count" entry - evaluate the assignment.
			eval "${nodes_to_use[node]}"
		elif [[ ${nodes_to_use[node]} =~ ^[0-9]+$ ]]; then
			# Bare node id - that node gets the default NRHUGE count.
			nodes_hp[nodes_to_use[node]]=$NRHUGE
		fi
	done

	for node in "${!nodes_hp[@]}"; do
		if [[ -z ${nodes[node]} ]]; then
			echo "Node $node doesn't exist, ignoring" >&2
			continue
		fi
		NRHUGE=${nodes_hp[node]:-$NRHUGE} check_hugepages_alloc "${nodes[node]}" "$node"
	done
}
470
# Top-level Linux "config" flow: bind PCI devices, make sure a hugetlbfs
# mount exists, allocate hugepages and fix up permissions/limits so
# TARGET_USER can use vfio and the hugepage mountpoints.
function configure_linux() {
	configure_linux_pci
	hugetlbfs_mounts=$(linux_hugetlbfs_mounts)

	if [ -z "$hugetlbfs_mounts" ]; then
		hugetlbfs_mounts=/mnt/huge
		echo "Mounting hugetlbfs at $hugetlbfs_mounts"
		mkdir -p "$hugetlbfs_mounts"
		mount -t hugetlbfs nodev "$hugetlbfs_mounts"
	fi

	configure_linux_hugepages

	if [ "$driver_name" = "vfio-pci" ]; then
		if [ -n "$TARGET_USER" ]; then
			for mount in $hugetlbfs_mounts; do
				chown "$TARGET_USER" "$mount"
				chmod g+w "$mount"
			done

			# Warn when the user's memlock limit caps the memory VFIO can pin.
			MEMLOCK_AMNT=$(su "$TARGET_USER" -c "ulimit -l")
			if [[ $MEMLOCK_AMNT != "unlimited" ]]; then
				MEMLOCK_MB=$((MEMLOCK_AMNT / 1024))
				cat <<- MEMLOCK
					"$TARGET_USER" user memlock limit: $MEMLOCK_MB MB

					This is the maximum amount of memory you will be
					able to use with DPDK and VFIO if run as user "$TARGET_USER".
					To change this, please adjust limits.conf memlock limit for user "$TARGET_USER".
				MEMLOCK
				if ((MEMLOCK_AMNT < 65536)); then
					echo ""
					echo "## WARNING: memlock limit is less than 64MB"
					echo -n "## DPDK with VFIO may not be able to initialize "
					echo "if run as user \"$TARGET_USER\"."
				fi
			fi
		fi
	fi

	if [ ! -f /dev/cpu/0/msr ]; then
		# Some distros build msr as a module.  Make sure it's loaded to ensure
		#  DPDK can easily figure out the TSC rate rather than relying on 100ms
		#  sleeps.
		modprobe msr || true
	fi
}
518
# Rebind every collected (not in-use) device back to its native kernel
# driver, or unbind it entirely when that driver is not present.
function reset_linux_pci() {
	# virtio
	# TODO: check if virtio-pci is loaded first and just unbind if it is not loaded
	# Requires some more investigation - for example, some kernels do not seem to have
	#  virtio-pci but just virtio_scsi instead.  Also need to make sure we get the
	#  underscore vs. dash right in the virtio_scsi name.
	modprobe virtio-pci || true
	for bdf in "${!all_devices_d[@]}"; do
		((all_devices_d["$bdf"] == 0)) || continue

		driver=$(collect_driver "$bdf")
		# check_for_driver returns non-zero when the driver is present, so
		# "! check_for_driver" means it is safe to bind to it.
		if ! check_for_driver "$driver"; then
			linux_bind_driver "$bdf" "$driver"
		else
			linux_unbind_driver "$bdf"
		fi
	done

	echo "1" > "/sys/bus/pci/rescan"
}
539
# Linux "reset" flow: restore original drivers and remove leftover SPDK
# hugepage map files and /run/.spdk* state files.
function reset_linux() {
	reset_linux_pci

	local mnt hp_file
	for mnt in $(linux_hugetlbfs_mounts); do
		for hp_file in "$mnt"/spdk*map_*; do
			# Remove only map files not locked by a still-running process.
			flock -n "$hp_file" true && rm -f "$hp_file"
		done
	done

	rm -f /run/.spdk*
}
549
# Print hugepage availability (per NUMA node, or system-wide when no node
# hugepage dirs exist) and a table of all collected SPDK-compatible devices
# with their drivers, NUMA nodes and block devices.  Headers go to stderr.
function status_linux() {
	echo "Hugepages" >&2
	printf "%-6s %10s %8s / %6s\n" "node" "hugesize" "free" "total" >&2

	numa_nodes=0
	shopt -s nullglob
	for path in /sys/devices/system/node/node*/hugepages/hugepages-*/; do
		numa_nodes=$((numa_nodes + 1))
		free_pages=$(cat $path/free_hugepages)
		all_pages=$(cat $path/nr_hugepages)

		# Extract the node name and page size out of the sysfs path.
		[[ $path =~ (node[0-9]+)/hugepages/hugepages-([0-9]+kB) ]]

		node=${BASH_REMATCH[1]}
		huge_size=${BASH_REMATCH[2]}

		printf "%-6s %10s %8s / %6s\n" $node $huge_size $free_pages $all_pages
	done
	shopt -u nullglob

	# fall back to system-wide hugepages
	if [ "$numa_nodes" = "0" ]; then
		free_pages=$(grep HugePages_Free /proc/meminfo | awk '{ print $2 }')
		all_pages=$(grep HugePages_Total /proc/meminfo | awk '{ print $2 }')
		node="-"
		huge_size="$HUGEPGSZ"

		printf "%-6s %10s %8s / %6s\n" $node $huge_size $free_pages $all_pages
	fi

	printf '\n%-8s %-15s %-6s %-6s %-7s %-16s %-10s %s\n' \
		"Type" "BDF" "Vendor" "Device" "NUMA" "Driver" "Device" "Block devices" >&2

	for bdf in "${!all_devices_d[@]}"; do
		driver=${drivers_d["$bdf"]}
		if [ "$numa_nodes" = "0" ]; then
			node="-"
		else
			node=$(cat /sys/bus/pci/devices/$bdf/numa_node)
			if ((node == -1)); then
				node=unknown
			fi
		fi
		if [ "$driver" = "nvme" ] && [ -d /sys/bus/pci/devices/$bdf/nvme ]; then
			name=$(ls /sys/bus/pci/devices/$bdf/nvme)
		else
			name="-"
		fi

		# Block devices are only looked up for NVMe/virtio controllers.
		if [[ -n ${nvme_d["$bdf"]} || -n ${virtio_d["$bdf"]} ]]; then
			blknames=($(get_block_dev_from_bdf "$bdf"))
		else
			blknames=("-")
		fi

		# Pick the first type map that contains this bdf as the Type label.
		desc=""
		desc=${desc:-${nvme_d["$bdf"]:+NVMe}}
		desc=${desc:-${ioat_d["$bdf"]:+I/OAT}}
		desc=${desc:-${idxd_d["$bdf"]:+IDXD}}
		desc=${desc:-${virtio_d["$bdf"]:+virtio}}
		desc=${desc:-${vmd_d["$bdf"]:+VMD}}

		printf '%-8s %-15s %-6s %-6s %-7s %-16s %-10s %s\n' \
			"$desc" "$bdf" "${pci_ids_vendor["$bdf"]#0x}" "${pci_ids_device["$bdf"]#0x}" \
			"$node" "${driver:--}" "${name:-}" "${blknames[*]:--}"
	done | sort -bk2,2
}
617
# FreeBSD status report: contigmem module/kenv state plus per-type device
# tables (BDF, vendor, device, driver as reported by pciconf).
function status_freebsd() {
	local pci

	# Runs in a subshell; prints one table for the BDFs passed as arguments.
	status_print() (
		local dev driver

		echo -e "BDF\t\tVendor\tDevice\tDriver"

		for pci; do
			driver=$(pciconf -l "pci$pci")
			# Keep only the driver name (part before '@').
			driver=${driver%@*}
			printf '%s\t%s\t%s\t%s\n' \
				"$pci" \
				"${pci_ids_vendor["$pci"]}" \
				"${pci_ids_device["$pci"]}" \
				"$driver"
		done | sort -k1,1
	)

	local contigmem=present
	local contigmem_buffer_size
	local contigmem_num_buffers

	if ! kldstat -q -m contigmem; then
		contigmem="not present"
	fi
	if ! contigmem_buffer_size=$(kenv hw.contigmem.buffer_size 2> /dev/null); then
		contigmem_buffer_size="not set"
	fi
	if ! contigmem_num_buffers=$(kenv hw.contigmem.num_buffers 2> /dev/null); then
		contigmem_num_buffers="not set"
	fi

	cat <<- BSD_INFO
		Contigmem ($contigmem)
		Buffer Size: $contigmem_buffer_size
		Num Buffers: $contigmem_num_buffers

		NVMe devices
		$(status_print "${!nvme_d[@]}")

		I/IOAT DMA
		$(status_print "${!ioat_d[@]}")

		IDXD DMA
		$(status_print "${!idxd_d[@]}")

		VMD
		$(status_print "${!vmd_d[@]}")
	BSD_INFO
}
669
# Collect all discovered device BDFs and (re)load nic_uio bound to them via
# the hw.nic_uio.bdfs kernel environment variable.
function configure_freebsd_pci() {
	local bdf_list=("${!nvme_d[@]}" "${!ioat_d[@]}" "${!idxd_d[@]}" "${!vmd_d[@]}")

	# Drop the domain part from all the addresses
	bdf_list=("${bdf_list[@]#*:}")

	# nic_uio expects a comma-separated list.
	local IFS=","
	kldunload nic_uio.ko || true
	kenv hw.nic_uio.bdfs="${bdf_list[*]}"
	kldload nic_uio.ko
}
686
# FreeBSD "config" flow: verify the dpdk modules exist, bind devices to
# nic_uio and (re)load contigmem with buffer counts matching HUGEMEM.
function configure_freebsd() {
	if ! check_for_driver_freebsd; then
		echo "DPDK drivers (contigmem and/or nic_uio) are missing, aborting" >&2
		return 1
	fi
	configure_freebsd_pci
	# If contigmem is already loaded but the HUGEMEM specified doesn't match the
	#  previous value, unload contigmem so that we can reload with the new value.
	if kldstat -q -m contigmem; then
		# contigmem may be loaded, but the kernel environment doesn't have to
		# be necessarily set at this point. If it isn't, kenv will fail to
		# pick up the hw. options. Handle it.
		if ! contigmem_num_buffers=$(kenv hw.contigmem.num_buffers); then
			contigmem_num_buffers=-1
		fi 2> /dev/null
		if ((contigmem_num_buffers != HUGEMEM / 256)); then
			kldunload contigmem.ko
		fi
	fi
	if ! kldstat -q -m contigmem; then
		# Each contigmem buffer is 256MB, so HUGEMEM / 256 buffers are needed.
		kenv hw.contigmem.num_buffers=$((HUGEMEM / 256))
		kenv hw.contigmem.buffer_size=$((256 * 1024 * 1024))
		kldload contigmem.ko
	fi
}
712
# FreeBSD "reset" flow: best-effort unload of both dpdk kernel modules;
# failures (module not loaded) are ignored.
function reset_freebsd() {
	local module
	for module in contigmem.ko nic_uio.ko; do
		kldunload "$module" || true
	done
}
717
# --- Main script flow ---

# Snapshot the current PCI bus state (cache_pci_bus comes from common.sh).
CMD=reset cache_pci_bus

mode=$1

if [ -z "$mode" ]; then
	mode="config"
fi

: ${HUGEMEM:=2048}
: ${PCI_ALLOWED:=""}
: ${PCI_BLOCKED:=""}

# NVME_ALLOWED is folded into PCI_ALLOWED for backward compatibility.
if [ -n "$NVME_ALLOWED" ]; then
	PCI_ALLOWED="$PCI_ALLOWED $NVME_ALLOWED"
fi

# SKIP_PCI blocks every device ("none" never matches a real BDF).
if [ -n "$SKIP_PCI" ]; then
	PCI_ALLOWED="none"
fi

# Default TARGET_USER to the invoking user (sudo-aware).
if [ -z "$TARGET_USER" ]; then
	TARGET_USER="$SUDO_USER"
	if [ -z "$TARGET_USER" ]; then
		TARGET_USER=$(logname 2> /dev/null) || true
	fi
fi

collect_devices "$mode"

if [[ $mode == reset && $PCI_BLOCK_SYNC_ON_RESET == yes ]]; then
	# Note that this will wait only for the first block device attached to
	# a given storage controller. For nvme this may miss some of the devs
	# in case multiple namespaces are being in place.
	# FIXME: Wait for nvme controller(s) to be in live state and determine
	# number of configured namespaces, build list of potential block devs
	# and pass them to sync_dev_uevents. Is it worth the effort?
	bdfs_to_wait_for=()
	for bdf in "${!all_devices_d[@]}"; do
		((all_devices_d["$bdf"] == 0)) || continue
		if [[ -n ${nvme_d["$bdf"]} || -n ${virtio_d["$bdf"]} ]]; then
			[[ $(collect_driver "$bdf") != "${drivers_d["$bdf"]}" ]] || continue
			bdfs_to_wait_for+=("$bdf")
		fi
	done
	if ((${#bdfs_to_wait_for[@]} > 0)); then
		echo "Waiting for block devices as requested"
		export UEVENT_TIMEOUT=5 DEVPATH_LOOKUP=yes DEVPATH_SUBSYSTEM=pci
		"$rootdir/scripts/sync_dev_uevents.sh" \
			block/disk \
			"${bdfs_to_wait_for[@]}" &
		sync_pid=$!
	fi
fi

if [[ $os == Linux ]]; then
	if [[ -n $HUGEPGSZ && ! -e /sys/kernel/mm/hugepages/hugepages-${HUGEPGSZ}kB ]]; then
		echo "${HUGEPGSZ}kB is not supported by the running kernel, ignoring" >&2
		unset -v HUGEPGSZ
	fi

	HUGEPGSZ=${HUGEPGSZ:-$(grep Hugepagesize /proc/meminfo | cut -d : -f 2 | tr -dc '0-9')}
	HUGEPGSZ_MB=$((HUGEPGSZ / 1024))
	# Round HUGEMEM (MB) up to a whole number of hugepages.
	: ${NRHUGE=$(((HUGEMEM + HUGEPGSZ_MB - 1) / HUGEPGSZ_MB))}

	if [ "$mode" == "config" ]; then
		configure_linux
	elif [ "$mode" == "cleanup" ]; then
		cleanup_linux
	elif [ "$mode" == "reset" ]; then
		reset_linux
	elif [ "$mode" == "status" ]; then
		status_linux
	elif [ "$mode" == "help" ]; then
		usage "$0"
	else
		usage "$0" "Invalid argument '$mode'"
	fi
else
	if [ "$mode" == "config" ]; then
		configure_freebsd
	elif [ "$mode" == "reset" ]; then
		reset_freebsd
	elif [ "$mode" == "cleanup" ]; then
		echo "setup.sh cleanup function not yet supported on $os"
	elif [ "$mode" == "status" ]; then
		status_freebsd
	elif [ "$mode" == "help" ]; then
		usage "$0"
	else
		usage "$0" "Invalid argument '$mode'"
	fi
fi

# Reap the background uevent-sync helper if it was started and still runs.
if [[ -e /proc/$sync_pid/status ]]; then
	wait "$sync_pid"
fi
814