#!/usr/bin/env bash
#  SPDX-License-Identifier: BSD-3-Clause
#  Copyright (C) 2016 Intel Corporation
#  All rights reserved.
#
set -e
shopt -s nullglob extglob

os=$(uname -s)

if [[ $os != Linux && $os != FreeBSD ]]; then
	echo "Unsupported platform ($os), aborting"
	exit 1
fi

rootdir=$(readlink -f $(dirname $0))/..
source "$rootdir/scripts/common.sh"

function usage() {
	if [[ $os == Linux ]]; then
		options="[config|reset|status|cleanup|interactive|help]"
	else
		options="[config|reset|interactive|help]"
	fi

	[[ -n $2 ]] && (
		echo "$2"
		echo ""
	)
	echo "Helper script for allocating hugepages and binding NVMe, I/OAT, VMD and Virtio devices"
	echo "to a generic VFIO kernel driver. If VFIO is not available on the system, this script"
	echo "will fall back to UIO. NVMe and Virtio devices with active mountpoints will be ignored."
	echo "All hugepage operations use the default hugepage size on the system (hugepagesz)."
	echo "Usage: $(basename $1) $options"
	echo
	echo "$options - as follows:"
	echo "config            Default mode. Allocate hugepages and bind PCI devices."
	if [[ $os == Linux ]]; then
		echo "cleanup           Remove any orphaned files that may be left behind after an SPDK application exits"
	fi
	echo "reset             Rebind PCI devices back to their original drivers."
	echo "                  Also clean up any leftover SPDK files/resources."
	echo "                  Hugepage memory size will remain unchanged."
	if [[ $os == Linux ]]; then
		echo "status            Print status of all SPDK-compatible devices on the system."
	fi
	echo "interactive       Executes script in interactive mode."
	echo "help              Print this help message."
	echo
	echo "The following environment variables can be specified."
	echo "HUGEMEM           Size of hugepage memory to allocate (in MB). 2048 by default."
	echo "                  For NUMA systems, the hugepages will be distributed on node0 by"
	echo "                  default."
	echo "HUGE_EVEN_ALLOC   If set to 'yes', hugepages will be evenly distributed across all of"
	echo "                  the system's NUMA nodes (effectively ignoring anything set in HUGENODE)."
	echo "                  Uses the kernel's default hugepage size."
	echo "NRHUGE            Number of hugepages to allocate. This variable overrides HUGEMEM."
	echo "HUGENODE          Specific NUMA node to allocate hugepages on. Multiple nodes can be"
	echo "                  separated with commas. By default, NRHUGE will be applied on each node."
	echo "                  Hugepages can be defined per node with e.g.:"
	echo "                  HUGENODE='nodes_hp[0]=2048,nodes_hp[1]=512,2' - this will allocate"
	echo "                  2048 pages for node0, 512 for node1 and default NRHUGE for node2."
	echo "HUGEPGSZ          Size of the hugepages to use in kB. If not set, kernel's default"
	echo "                  setting is used."
	echo "SHRINK_HUGE       If set to 'yes', hugepages allocation won't be skipped in case the"
	echo "                  number of requested hugepages is lower than what's already"
	echo "                  allocated. Doesn't apply when HUGE_EVEN_ALLOC is in use."
	echo "CLEAR_HUGE        If set to 'yes', an attempt to remove hugepages from all nodes will"
	echo "                  be made prior to allocation."
	echo "PCI_ALLOWED"
	echo "PCI_BLOCKED       Whitespace-separated list of PCI devices (NVMe, I/OAT, VMD, Virtio)."
	echo "                  Each device must be specified as a full PCI address."
	echo "                  E.g. PCI_ALLOWED=\"0000:01:00.0 0000:02:00.0\""
	echo "                  To block all PCI devices: PCI_ALLOWED=\"none\""
	echo "                  To allow all PCI devices except 0000:01:00.0: PCI_BLOCKED=\"0000:01:00.0\""
	echo "                  To allow only PCI device 0000:01:00.0: PCI_ALLOWED=\"0000:01:00.0\""
	echo "                  If PCI_ALLOWED and PCI_BLOCKED are empty or unset, all PCI devices"
	echo "                  will be bound."
	echo "                  Each device in PCI_BLOCKED will be ignored (driver won't be changed)."
	echo "                  PCI_BLOCKED has precedence over PCI_ALLOWED."
	echo "TARGET_USER       User that will own hugepage mountpoint directory and vfio groups."
	echo "                  By default the current user will be used."
	echo "DRIVER_OVERRIDE   Disable automatic vfio-pci/uio_pci_generic selection and forcefully"
	echo "                  bind devices to the given driver."
	echo "                  E.g. DRIVER_OVERRIDE=uio_pci_generic or DRIVER_OVERRIDE=/home/public/dpdk/build/kmod/igb_uio.ko"
	echo "PCI_BLOCK_SYNC_ON_RESET"
	echo "                  If set in the environment, an attempt to wait for block devices associated"
	echo "                  with a given PCI device will be made upon reset."
	echo "UNBIND_ENTIRE_IOMMU_GROUP"
	echo "                  If set, all devices from the nvme device's iommu group will be unbound from their drivers."
	echo "                  Use with caution."
	exit 0
}
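
# Example invocations (illustrative values; the script is typically run as root):
#   HUGEMEM=4096 ./scripts/setup.sh               # allocate 4 GB of hugepages and bind devices
#   PCI_BLOCKED="0000:01:00.0" ./scripts/setup.sh # bind everything except 0000:01:00.0
#   ./scripts/setup.sh reset                      # rebind devices to their original drivers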

# In monolithic kernels lsmod won't work, so back that check with /sys/module.
# We also check /sys/bus/pci/drivers/ as neither lsmod nor /sys/module might
# contain the needed info (as on Fedora-like OSes).
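# Return values (derived from the checks below): 0 - driver not present,
# 1 - driver reported as loaded by lsmod, 2 - driver only visible under sysfs.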
function check_for_driver() {
	if [[ -z $1 ]]; then
		return 0
	fi

	if lsmod | grep -q ${1//-/_}; then
		return 1
	fi

	if [[ -d /sys/module/${1} || -d \
		/sys/module/${1//-/_} || -d \
		/sys/bus/pci/drivers/${1} || -d \
		/sys/bus/pci/drivers/${1//-/_} ]]; then
		return 2
	fi
	return 0
}

function check_for_driver_freebsd() {
	# Check if dpdk drivers (nic_uio, contigmem) are in the kernel's module path.
	local search_paths path driver
	IFS=";" read -ra search_paths < <(kldconfig -rU)

	for driver in contigmem.ko nic_uio.ko; do
		for path in "${search_paths[@]}"; do
			[[ -f $path/$driver ]] && continue 2
		done
		return 1
	done
	return 0
}

function pci_dev_echo() {
	local bdf="$1"
	shift
	echo "$bdf (${pci_ids_vendor["$bdf"]#0x} ${pci_ids_device["$bdf"]#0x}): $*"
}

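# Bind the device at $1 to the driver named in $2: unbind the current driver
# (if any), set the sysfs driver_override, poke drivers_probe (retrying up to
# 10 times), then clear the override and verify the binding took effect.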
function probe_driver() {
	local bdf=$1
	local driver_name=$2
	old_driver_name=${drivers_d["$bdf"]:-no driver}

	if [[ $driver_name == "$old_driver_name" ]]; then
		pci_dev_echo "$bdf" "Already using the $old_driver_name driver"
		return 0
	fi

	if [[ $old_driver_name != "no driver" ]]; then
		echo "$bdf" > "/sys/bus/pci/devices/$bdf/driver/unbind"
	fi

	pci_dev_echo "$bdf" "$old_driver_name -> $driver_name"

	if [[ $driver_name == "none" ]]; then
		return 0
	fi

	local probe_attempts=0
	echo "$driver_name" > "/sys/bus/pci/devices/$bdf/driver_override"
	while ! echo "$bdf" > "/sys/bus/pci/drivers_probe" && ((probe_attempts++ < 10)); do
		pci_dev_echo "$bdf" "failed to bind to $driver_name, retrying ($probe_attempts)"
		sleep 0.5
	done 2> /dev/null

	echo "" > "/sys/bus/pci/devices/$bdf/driver_override"

	if [[ ! -e /sys/bus/pci/drivers/$driver_name/$bdf ]]; then
		pci_dev_echo "$bdf" "failed to bind to $driver_name, aborting"
		return 1
	fi
}

function linux_bind_driver() {
	local bdf="$1"
	local driver_name="$2"

	probe_driver "$bdf" "$driver_name"

	iommu_group=$(basename $(readlink -f /sys/bus/pci/devices/$bdf/iommu_group))
	if [ -e "/dev/vfio/$iommu_group" ]; then
		if [ -n "$TARGET_USER" ]; then
			chown "$TARGET_USER" "/dev/vfio/$iommu_group"
		fi
	fi

	local iommug=("/sys/bus/pci/devices/$bdf/iommu_group/devices/"!($bdf))
	local _bdf _driver
	if ((${#iommug[@]} > 0)) && [[ $driver_name == vfio* ]]; then
		pci_dev_echo "$bdf" "WARNING: detected multiple devices (${#iommug[@]}) under the same IOMMU group!"
		for _bdf in "${iommug[@]}"; do
			_driver=$(readlink -f "$_bdf/driver")
			if [[ ! -e $_driver || ${_driver##*/} == "$driver_name" ]]; then
				continue
			fi
			# See what DPDK considers to be a "viable" iommu group: dpdk/lib/eal/linux/eal_vfio.c -> rte_vfio_setup_device()
			pci_dev_echo "$bdf" "WARNING: ${_bdf##*/} not bound to $driver_name (${_driver##*/})"
			pci_dev_echo "$bdf" "WARNING: All devices in the IOMMU group must be bound to the same driver or unbound"
			if [[ $UNBIND_ENTIRE_IOMMU_GROUP == yes ]]; then
				pci_dev_echo "$bdf" "WARNING: Attempting to unbind ${_bdf##*/}"
				drivers_d["${_bdf##*/}"]=${_driver##*/}
				probe_driver "${_bdf##*/}" none
			fi
		done
	fi
}

function linux_unbind_driver() {
	local bdf="$1"
	local old_driver_name=${drivers_d["$bdf"]:-no driver}

	if [[ $old_driver_name == "no driver" ]]; then
		pci_dev_echo "$bdf" "Not bound to any driver"
		return 0
	fi

	if [[ -e /sys/bus/pci/drivers/$old_driver_name ]]; then
		echo "$bdf" > "/sys/bus/pci/drivers/$old_driver_name/unbind"
		echo "" > "/sys/bus/pci/devices/$bdf/driver_override"
	fi

	pci_dev_echo "$bdf" "$old_driver_name -> no driver"
}

function linux_hugetlbfs_mounts() {
	mount | grep ' type hugetlbfs ' | awk '{ print $3 }'
}

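# Print the kernel block device names (e.g. nvme0n1) that belong to the PCI
# device at the given BDF address.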
function get_block_dev_from_bdf() {
	local bdf=$1
	local block blocks=() ctrl

	for block in /sys/block/*; do
		if [[ $block == *nvme* ]]; then
			ctrl=$(readlink -f "$block/device") ctrl=${ctrl##*/}
			if [[ -e /sys/class/nvme/$ctrl && $(< "/sys/class/nvme/$ctrl/address") == "$bdf" ]]; then
				blocks+=("${block##*/}")
			fi
		elif [[ $(readlink -f "$block/device") == *"/$bdf/"* ]]; then
			blocks+=("${block##*/}")
		fi
	done
	printf '%s\n' "${blocks[@]}"
}

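# List everything that makes block devices behind the given BDF "in use":
# holder@<dev>:<holder> for devices stacked on top (e.g. LVM or md),
# mount@<dev>:<dev> for active mountpoints, and data@<dev> when a valid
# signature is present on the device even if it's not mounted.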
function get_used_bdf_block_devs() {
	local bdf=$1
	local blocks block blockp dev mount holder
	local used

	hash lsblk &> /dev/null || return 1
	blocks=($(get_block_dev_from_bdf "$bdf"))

	for block in "${blocks[@]}"; do
		# Check if the device is held by some other device, regardless of
		# whether it's mounted or not.
		for holder in "/sys/class/block/$block"*/holders/*; do
			[[ -e $holder ]] || continue
			blockp=${holder%/holders*} blockp=${blockp##*/}
			if [[ -e $holder/slaves/$blockp ]]; then
				used+=("holder@$blockp:${holder##*/}")
			fi
		done
		while read -r dev mount; do
			if [[ -e $mount ]]; then
				used+=("mount@$block:$dev")
			fi
		done < <(lsblk -l -n -o NAME,MOUNTPOINT "/dev/$block")
		if ((${#used[@]} == 0)); then
			# Make sure we check if there's any valid data present on the target device
			# regardless of whether it's being actively used or not. This is mainly done
			# to make sure we don't miss more complex setups like ZFS pools, etc.
			if block_in_use "$block" > /dev/null; then
				used+=("data@$block")
			fi
		fi
	done

	if ((${#used[@]} > 0)); then
		printf '%s\n' "${used[@]}"
	fi
}

function collect_devices() {
	# NVMe, IOAT, DSA, IAA, VIRTIO, VMD
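	# Populates global maps keyed by BDF: nvme_d, ioat_d, dsa_d, iaa_d,
	# virtio_d and vmd_d (0 - device may be used, 1 - device is skipped),
	# plus all_devices_d, all_devices_type_d and drivers_d (current driver).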

	local ids dev_type dev_id bdf bdfs in_use driver

	ids+="PCI_DEVICE_ID_INTEL_IOAT"
	ids+="|PCI_DEVICE_ID_INTEL_DSA"
	ids+="|PCI_DEVICE_ID_INTEL_IAA"
	ids+="|PCI_DEVICE_ID_VIRTIO"
	ids+="|PCI_DEVICE_ID_INTEL_VMD"
	ids+="|SPDK_PCI_CLASS_NVME"

	local -gA nvme_d ioat_d dsa_d iaa_d virtio_d vmd_d all_devices_d drivers_d types_d all_devices_type_d

	while read -r _ dev_type dev_id; do
		bdfs=(${pci_bus_cache["0x8086:$dev_id"]})
		[[ $dev_type == *NVME* ]] && bdfs=(${pci_bus_cache["$dev_id"]})
		[[ $dev_type == *VIRT* ]] && bdfs=(${pci_bus_cache["0x1af4:$dev_id"]})
		[[ $dev_type =~ (NVME|IOAT|DSA|IAA|VIRTIO|VMD) ]] && dev_type=${BASH_REMATCH[1],,}
		types_d["$dev_type"]=1
		for bdf in "${bdfs[@]}"; do
			in_use=0
			if [[ $1 != status ]]; then
				if ! pci_can_use "$bdf"; then
					pci_dev_echo "$bdf" "Skipping denied controller at $bdf"
					in_use=1
				fi
				if [[ $dev_type == nvme || $dev_type == virtio ]]; then
					if ! verify_bdf_block_devs "$bdf"; then
						in_use=1
					fi
				fi
				if [[ $dev_type == vmd ]]; then
					if [[ $PCI_ALLOWED != *"$bdf"* ]]; then
						pci_dev_echo "$bdf" "Skipping not-allowed VMD controller at $bdf"
						in_use=1
					elif [[ " ${drivers_d[*]} " =~ "nvme" ]]; then
						if [[ "${DRIVER_OVERRIDE}" != "none" ]]; then
							if [ "$mode" == "config" ]; then
								cat <<- MESSAGE
									Binding new driver to VMD device. If there are NVMe SSDs behind the VMD endpoint
									which are attached to the kernel NVMe driver, the binding process may go faster
									if you first run this script with DRIVER_OVERRIDE="none" to unbind only the
									NVMe SSDs, and then run again to unbind the VMD devices.
								MESSAGE
							fi
						fi
					fi
				fi
			fi
			eval "${dev_type}_d[$bdf]=$in_use"
			all_devices_d["$bdf"]=$in_use
			all_devices_type_d["$bdf"]=$dev_type
			if [[ -e /sys/bus/pci/devices/$bdf/driver ]]; then
				driver=$(readlink -f "/sys/bus/pci/devices/$bdf/driver")
				drivers_d["$bdf"]=${driver##*/}
			else
				drivers_d["$bdf"]=""
			fi
		done
	done < <(grep -E "$ids" "$rootdir/include/spdk/pci_ids.h")
}

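# Resolve the driver that would normally claim the device at the given BDF:
# prefer the modalias lookup via modprobe, then fall back to a per-type default.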
function collect_driver() {
	local bdf=$1
	local drivers driver

	if [[ -e /sys/bus/pci/devices/$bdf/modalias ]] \
		&& drivers=($(modprobe -R "$(< "/sys/bus/pci/devices/$bdf/modalias")")); then
		# Pick first entry in case multiple aliases are bound to a driver.
		driver=$(readlink -f "/sys/module/${drivers[0]}/drivers/pci:"*)
		driver=${driver##*/}
	else
		[[ -n ${nvme_d["$bdf"]} ]] && driver=nvme
		[[ -n ${ioat_d["$bdf"]} ]] && driver=ioatdma
		[[ -n ${dsa_d["$bdf"]} ]] && driver=dsa
		[[ -n ${iaa_d["$bdf"]} ]] && driver=iaa
		[[ -n ${virtio_d["$bdf"]} ]] && driver=virtio-pci
		[[ -n ${vmd_d["$bdf"]} ]] && driver=vmd
	fi 2> /dev/null
	echo "$driver"
}

function verify_bdf_block_devs() {
	local bdf=$1
	local blknames
	blknames=($(get_used_bdf_block_devs "$bdf")) || return 1

	if ((${#blknames[@]} > 0)); then
		local IFS=","
		pci_dev_echo "$bdf" "Active devices: ${blknames[*]}, so not binding PCI dev"
		return 1
	fi
}

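# Driver selection precedence used below: DRIVER_OVERRIDE (a module name or a
# path to a .ko) wins; otherwise vfio-pci is picked when the IOMMU is enabled
# (or the unsafe no-IOMMU mode is on), then uio_pci_generic, then the igb_uio
# fallback built under dpdk/build-tmp.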
function configure_linux_pci() {
	local driver_path=""
	driver_name=""
	igb_uio_fallback=""

	if [[ -r "$rootdir/dpdk/build-tmp/kernel/linux/igb_uio/igb_uio.ko" ]]; then
		# igb_uio is a common driver to override with and it depends on uio.
		modprobe uio || true
		if ! check_for_driver igb_uio || insmod "$rootdir/dpdk/build-tmp/kernel/linux/igb_uio/igb_uio.ko"; then
			igb_uio_fallback="$rootdir/dpdk/build-tmp/kernel/linux/igb_uio/igb_uio.ko"
		fi
	fi

	if [[ "${DRIVER_OVERRIDE}" == "none" ]]; then
		driver_name=none
	elif [[ -n "${DRIVER_OVERRIDE}" ]]; then
		driver_path="$DRIVER_OVERRIDE"
		driver_name="${DRIVER_OVERRIDE##*/}"
		# modprobe and the sysfs don't use the .ko suffix.
		driver_name=${driver_name%.ko}
		# path = name -> there is no path
		if [[ "$driver_path" = "$driver_name" ]]; then
			driver_path=""
		fi
	elif [[ -n "$(ls /sys/kernel/iommu_groups)" || (-e \
	/sys/module/vfio/parameters/enable_unsafe_noiommu_mode && \
	"$(cat /sys/module/vfio/parameters/enable_unsafe_noiommu_mode)" == "Y") ]]; then
		driver_name=vfio-pci
		# Just in case, attempt to load the VFIO_IOMMU_TYPE1 module into the kernel - this
		# should be done automatically by modprobe since this particular module should
		# be a part of vfio-pci dependencies, however, on some distros, it seems that
		# it's not the case. See #1689.
		if modinfo vfio_iommu_type1 > /dev/null; then
			modprobe vfio_iommu_type1
		fi
	elif ! check_for_driver uio_pci_generic || modinfo uio_pci_generic > /dev/null 2>&1; then
		driver_name=uio_pci_generic
	elif [[ -e $igb_uio_fallback ]]; then
		driver_path="$igb_uio_fallback"
		driver_name="igb_uio"
		echo "WARNING: uio_pci_generic not detected - using $driver_name"
	else
		echo "No valid drivers found [vfio-pci, uio_pci_generic, igb_uio]. Please enable one of the kernel modules."
		return 1
	fi

	# modprobe assumes the module lives in the kernel's module directory. If the
	# user passed in a path, we should use insmod instead.
	if [[ $driver_name != "none" ]]; then
		if [[ -n "$driver_path" ]]; then
			insmod $driver_path || true
		else
			modprobe $driver_name
		fi
	fi

	for bdf in "${!all_devices_d[@]}"; do
		if ((all_devices_d["$bdf"] == 0)); then
			if [[ -n ${nvme_d["$bdf"]} ]]; then
				# Some nvme controllers may take a significant amount of time while being
				# unbound from the driver. Put that task into the background to speed up the
				# whole process. Currently this is done only for the devices bound to the
				# nvme driver as others, e.g. ioatdma, trigger a kernel BUG when being
				# unbound in parallel. See https://bugzilla.kernel.org/show_bug.cgi?id=209041.
				linux_bind_driver "$bdf" "$driver_name" &
			else
				linux_bind_driver "$bdf" "$driver_name"
			fi
		fi
	done
	wait

	echo "1" > "/sys/bus/pci/rescan"
}

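# Remove files and directories (hugepage maps, traces, PCI/CPU lock files)
# left behind in /dev/shm, /var/run, /tmp and $XDG_RUNTIME_DIR by SPDK apps,
# skipping anything still held open by a running process.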
function cleanup_linux() {
	local dirs_to_clean=() files_to_clean=() opened_files=() file_locks=()
	local match_spdk="spdk_tgt|iscsi|vhost|nvmf|rocksdb|bdevio|bdevperf|vhost_fuzz|nvme_fuzz|accel_perf|bdev_svc"

	dirs_to_clean=({/var/run,/tmp}/dpdk/spdk{,_pid}+([0-9]))
	if [[ -d $XDG_RUNTIME_DIR ]]; then
		dirs_to_clean+=("$XDG_RUNTIME_DIR/dpdk/spdk"{,_pid}+([0-9]))
	fi

	for dir in "${dirs_to_clean[@]}"; do
		files_to_clean+=("$dir/"*)
	done
	file_locks+=(/var/tmp/spdk_pci_lock*)
	file_locks+=(/var/tmp/spdk_cpu_lock*)

	files_to_clean+=(/dev/shm/@(@($match_spdk)_trace|spdk_iscsi_conns)*)
	files_to_clean+=("${file_locks[@]}")

	# This may fail in case a path that readlink attempts to resolve suddenly
	# disappears (as may happen with terminating processes).
	opened_files+=($(readlink -f /proc/+([0-9])/fd/+([0-9]))) || true

	if ((${#opened_files[@]} == 0)); then
		echo "Can't get list of opened files!"
		exit 1
	fi

	echo 'Cleaning'
	for f in "${files_to_clean[@]}"; do
		[[ -e $f ]] || continue
		if [[ ${opened_files[*]} != *"$f"* ]]; then
			echo "Removing:    $f"
			rm $f
		else
			echo "Still open: $f"
		fi
	done

	for dir in "${dirs_to_clean[@]}"; do
		[[ -d $dir ]] || continue
		if [[ ${opened_files[*]} != *"$dir"* ]]; then
			echo "Removing:    $dir"
			rmdir $dir
		else
			echo "Still open: $dir"
		fi
	done
	echo "Clean"
}

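# Allocate NRHUGE hugepages through the sysfs/procfs interface passed as $1;
# $2 is an optional NUMA node id used only in the messages.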
check_hugepages_alloc() {
	local hp_int=$1
	local allocated_hugepages

	allocated_hugepages=$(< "$hp_int")

	if ((NRHUGE <= allocated_hugepages)) && [[ $SHRINK_HUGE != yes ]]; then
		echo "INFO: Requested $NRHUGE hugepages but $allocated_hugepages already allocated ${2:+on node$2}"
		return 0
	fi

	echo $((NRHUGE < 0 ? 0 : NRHUGE)) > "$hp_int"

	allocated_hugepages=$(< "$hp_int")
	if ((allocated_hugepages < NRHUGE)); then
		cat <<- ERROR

			## ERROR: requested $NRHUGE hugepages but only $allocated_hugepages could be allocated ${2:+on node$2}.
			## Memory might be heavily fragmented. Please try flushing the system cache, or reboot the machine.
		ERROR
		return 1
	fi
}

clear_hugepages() { echo 0 > /proc/sys/vm/nr_hugepages; }

configure_linux_hugepages() {
	local node system_nodes
	local nodes_to_use nodes_hp

	if [[ $CLEAR_HUGE == yes ]]; then
		clear_hugepages
	fi

	if [[ $HUGE_EVEN_ALLOC == yes ]]; then
		clear_hugepages
		check_hugepages_alloc /proc/sys/vm/nr_hugepages
		return 0
	fi

	for node in /sys/devices/system/node/node*; do
		[[ -e $node ]] || continue
		nodes[${node##*node}]=$node/hugepages/hugepages-${HUGEPGSZ}kB/nr_hugepages
	done

	if ((${#nodes[@]} == 0)); then
		# No NUMA support? Fall back to the common interface
		check_hugepages_alloc /proc/sys/vm/nr_hugepages
		return 0
	fi

	IFS="," read -ra nodes_to_use <<< "$HUGENODE"
	if ((${#nodes_to_use[@]} == 0)); then
		nodes_to_use[0]=0
	fi
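
	# For instance (illustrative values), HUGENODE="nodes_hp[0]=2048,1" requests
	# 2048 pages on node0 and the default NRHUGE on node1.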

	# Align indexes with node ids
	for node in "${!nodes_to_use[@]}"; do
		if [[ ${nodes_to_use[node]} =~ ^nodes_hp\[[0-9]+\]= ]]; then
			eval "${nodes_to_use[node]}"
		elif [[ ${nodes_to_use[node]} =~ ^[0-9]+$ ]]; then
			nodes_hp[nodes_to_use[node]]=$NRHUGE
		fi
	done

	for node in "${!nodes_hp[@]}"; do
		if [[ -z ${nodes[node]} ]]; then
			echo "Node $node doesn't exist, ignoring" >&2
			continue
		fi
		NRHUGE=${nodes_hp[node]:-$NRHUGE} check_hugepages_alloc "${nodes[node]}" "$node"
	done
}

function configure_linux() {
	configure_linux_pci
	hugetlbfs_mounts=$(linux_hugetlbfs_mounts)

	if [ -z "$hugetlbfs_mounts" ]; then
		hugetlbfs_mounts=/mnt/huge
		echo "Mounting hugetlbfs at $hugetlbfs_mounts"
		mkdir -p "$hugetlbfs_mounts"
		mount -t hugetlbfs nodev "$hugetlbfs_mounts"
	fi

	configure_linux_hugepages

	if [ "$driver_name" = "vfio-pci" ]; then
		if [ -n "$TARGET_USER" ]; then
			for mount in $hugetlbfs_mounts; do
				chown "$TARGET_USER" "$mount"
				chmod g+w "$mount"
			done

			MEMLOCK_AMNT=$(su "$TARGET_USER" -c "ulimit -l")
			if [[ $MEMLOCK_AMNT != "unlimited" ]]; then
				MEMLOCK_MB=$((MEMLOCK_AMNT / 1024))
				cat <<- MEMLOCK
					"$TARGET_USER" user memlock limit: $MEMLOCK_MB MB

					This is the maximum amount of memory you will be
					able to use with DPDK and VFIO if run as user "$TARGET_USER".
					To change this, please adjust limits.conf memlock limit for user "$TARGET_USER".
				MEMLOCK
				if ((MEMLOCK_AMNT < 65536)); then
					echo ""
					echo "## WARNING: memlock limit is less than 64MB"
					echo -n "## DPDK with VFIO may not be able to initialize "
					echo "if run as user \"$TARGET_USER\"."
				fi
			fi
		fi
	fi

	if [ $(uname -i) == "x86_64" ] && [ ! -e /dev/cpu/0/msr ]; then
		# Some distros build msr as a module.  Make sure it's loaded to ensure
		#  DPDK can easily figure out the TSC rate rather than relying on 100ms
		#  sleeps.
		modprobe msr &> /dev/null || true
	fi
}

function reset_linux_pci() {
	# virtio
	# TODO: check if virtio-pci is loaded first and just unbind if it is not loaded
	# Requires some more investigation - for example, some kernels do not seem to have
	#  virtio-pci but just virtio_scsi instead.  Also need to make sure we get the
	#  underscore vs. dash right in the virtio_scsi name.
	modprobe virtio-pci || true
	for bdf in "${!all_devices_d[@]}"; do
		((all_devices_d["$bdf"] == 0)) || continue

		driver=$(collect_driver "$bdf")
		if [[ -n $driver ]] && ! check_for_driver "$driver"; then
			linux_bind_driver "$bdf" "$driver"
		else
			linux_unbind_driver "$bdf"
		fi
	done

	echo "1" > "/sys/bus/pci/rescan"
}

function reset_linux() {
	reset_linux_pci
	for mount in $(linux_hugetlbfs_mounts); do
		for hp in "$mount"/spdk*map_*; do
			flock -n "$hp" true && rm -f "$hp"
		done
	done
	rm -f /run/.spdk*
}

function status_linux() {
	echo "Hugepages" >&2
	printf "%-6s %10s %8s / %6s\n" "node" "hugesize" "free" "total" >&2

	numa_nodes=0
	for path in /sys/devices/system/node/node*/hugepages/hugepages-*/; do
		numa_nodes=$((numa_nodes + 1))
		free_pages=$(cat $path/free_hugepages)
		all_pages=$(cat $path/nr_hugepages)

		[[ $path =~ (node[0-9]+)/hugepages/hugepages-([0-9]+kB) ]]

		node=${BASH_REMATCH[1]}
		huge_size=${BASH_REMATCH[2]}

		printf "%-6s %10s %8s / %6s\n" $node $huge_size $free_pages $all_pages
	done

	# fall back to system-wide hugepages
	if [ "$numa_nodes" = "0" ]; then
		free_pages=$(grep HugePages_Free /proc/meminfo | awk '{ print $2 }')
		all_pages=$(grep HugePages_Total /proc/meminfo | awk '{ print $2 }')
		node="-"
		huge_size="$HUGEPGSZ"

		printf "%-6s %10s %8s / %6s\n" $node $huge_size $free_pages $all_pages
	fi

	printf '\n%-8s %-15s %-6s %-6s %-7s %-16s %-10s %s\n' \
		"Type" "BDF" "Vendor" "Device" "NUMA" "Driver" "Device" "Block devices" >&2

	sorted_bdfs=($(printf '%s\n' "${!all_devices_d[@]}" | sort))

	for bdf in "${sorted_bdfs[@]}"; do
		driver=${drivers_d["$bdf"]}
		if [ "$numa_nodes" = "0" ]; then
			node="-"
		else
			node=$(cat /sys/bus/pci/devices/$bdf/numa_node)
			if ((node == -1)); then
				node=unknown
			fi
		fi
		if [ "$driver" = "nvme" ] && [ -d /sys/bus/pci/devices/$bdf/nvme ]; then
			name=$(ls /sys/bus/pci/devices/$bdf/nvme)
		else
			name="-"
		fi

		if [[ -n ${nvme_d["$bdf"]} || -n ${virtio_d["$bdf"]} ]]; then
			blknames=($(get_block_dev_from_bdf "$bdf"))
		else
			blknames=("-")
		fi

		desc=""
		desc=${desc:-${nvme_d["$bdf"]:+NVMe}}
		desc=${desc:-${ioat_d["$bdf"]:+I/OAT}}
		desc=${desc:-${dsa_d["$bdf"]:+DSA}}
		desc=${desc:-${iaa_d["$bdf"]:+IAA}}
		desc=${desc:-${virtio_d["$bdf"]:+virtio}}
		desc=${desc:-${vmd_d["$bdf"]:+VMD}}

		printf '%-8s %-15s %-6s %-6s %-7s %-16s %-10s %s\n' \
			"$desc" "$bdf" "${pci_ids_vendor["$bdf"]#0x}" "${pci_ids_device["$bdf"]#0x}" \
			"$node" "${driver:--}" "${name:-}" "${blknames[*]:--}"
	done
}

function status_freebsd() {
	local pci

	status_print() (
		local type=$1
		local dev driver

		shift

		for pci; do
			printf '%-8s %-15s %-6s %-6s %-16s\n' \
				"$type" \
				"$pci" \
				"${pci_ids_vendor["$pci"]}" \
				"${pci_ids_device["$pci"]}" \
				"${pci_bus_driver["$pci"]}"
		done | sort -k2,2
	)

	local contigmem=present
	local contigmem_buffer_size
	local contigmem_num_buffers

	if ! kldstat -q -m contigmem; then
		contigmem="not present"
	fi
	if ! contigmem_buffer_size=$(kenv hw.contigmem.buffer_size 2> /dev/null); then
		contigmem_buffer_size="not set"
	fi
	if ! contigmem_num_buffers=$(kenv hw.contigmem.num_buffers 2> /dev/null); then
		contigmem_num_buffers="not set"
	fi

	cat <<- BSD_INFO
		Contigmem ($contigmem)
		Buffer Size: $contigmem_buffer_size
		Num Buffers: $contigmem_num_buffers

	BSD_INFO

	printf '\n%-8s %-15s %-6s %-6s %-16s\n' \
		"Type" "BDF" "Vendor" "Device" "Driver" >&2

	status_print "NVMe" "${!nvme_d[@]}"
	status_print "I/OAT" "${!ioat_d[@]}"
	status_print "DSA" "${!dsa_d[@]}"
	status_print "IAA" "${!iaa_d[@]}"
	status_print "VMD" "${!vmd_d[@]}"
}

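# nic_uio takes the device list via the hw.nic_uio.bdfs kernel environment
# variable as comma-separated bus:device.function entries (no domain part).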
function configure_freebsd_pci() {
	local BDFS

	BDFS+=("${!nvme_d[@]}")
	BDFS+=("${!ioat_d[@]}")
	BDFS+=("${!dsa_d[@]}")
	BDFS+=("${!iaa_d[@]}")
	BDFS+=("${!vmd_d[@]}")

	# Drop the domain part from all the addresses
	BDFS=("${BDFS[@]#*:}")

	local IFS=","
	kldunload nic_uio.ko || true
	kenv hw.nic_uio.bdfs="${BDFS[*]}"
	kldload nic_uio.ko
}

function configure_freebsd() {
	if ! check_for_driver_freebsd; then
		echo "DPDK drivers (contigmem and/or nic_uio) are missing, aborting" >&2
		return 1
	fi
	configure_freebsd_pci
	# If contigmem is already loaded but the HUGEMEM specified doesn't match the
	#  previous value, unload contigmem so that we can reload with the new value.
	if kldstat -q -m contigmem; then
		# contigmem may be loaded, but the kernel environment doesn't necessarily
		# have to be set at this point. If it isn't, kenv will fail to pick up
		# the hw. options. Handle it.
		if ! contigmem_num_buffers=$(kenv hw.contigmem.num_buffers); then
			contigmem_num_buffers=-1
		fi 2> /dev/null
		if ((contigmem_num_buffers != HUGEMEM / 256)); then
			kldunload contigmem.ko
		fi
	fi
	if ! kldstat -q -m contigmem; then
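		# contigmem carves the requested HUGEMEM (in MB) into 256 MB buffers, so
		# num_buffers * buffer_size adds up to HUGEMEM MB in total.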
		kenv hw.contigmem.num_buffers=$((HUGEMEM / 256))
		kenv hw.contigmem.buffer_size=$((256 * 1024 * 1024))
		kldload contigmem.ko
	fi
}

function reset_freebsd() {
	kldunload contigmem.ko || true
	kldunload nic_uio.ko || true
}

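# Derive hugepage parameters: validate/default HUGEPGSZ (in kB) against the
# running kernel, then compute NRHUGE as HUGEMEM (in MB) divided by the
# hugepage size, rounded up - e.g. with 2048 kB hugepages, HUGEMEM=2048
# yields NRHUGE=1024.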
function set_hp() {
	if [[ -n $HUGEPGSZ && ! -e /sys/kernel/mm/hugepages/hugepages-${HUGEPGSZ}kB ]]; then
		echo "${HUGEPGSZ}kB is not supported by the running kernel, ignoring" >&2
		unset -v HUGEPGSZ
	fi

	HUGEPGSZ=${HUGEPGSZ:-$(grep Hugepagesize /proc/meminfo | cut -d : -f 2 | tr -dc '0-9')}
	HUGEPGSZ_MB=$((HUGEPGSZ / 1024))
	NRHUGE=${NRHUGE:-$(((HUGEMEM + HUGEPGSZ_MB - 1) / HUGEPGSZ_MB))}
}

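# Cache the state of the PCI bus using the scripts/common.sh helpers; this
# fills in pci_ids_vendor, pci_ids_device and pci_bus_cache used throughout.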
CMD=reset cache_pci_bus

mode=$1

if [ -z "$mode" ]; then
	mode="config"
fi

: ${HUGEMEM:=2048}
: ${PCI_ALLOWED:=""}
: ${PCI_BLOCKED:=""}

if [ -n "$NVME_ALLOWED" ]; then
	PCI_ALLOWED="$PCI_ALLOWED $NVME_ALLOWED"
fi

if [ -n "$SKIP_PCI" ]; then
	PCI_ALLOWED="none"
fi

if [ -z "$TARGET_USER" ]; then
	TARGET_USER="$SUDO_USER"
	if [ -z "$TARGET_USER" ]; then
		TARGET_USER=$(logname 2> /dev/null) || true
	fi
fi

collect_devices "$mode"

if [[ $os == Linux ]]; then
	set_hp
fi

if [[ $mode == interactive ]]; then
	source "$rootdir/scripts/common/setup/interactive.sh"
	main_menu "$2" || exit 0
fi

if [[ $mode == reset && $PCI_BLOCK_SYNC_ON_RESET == yes ]]; then
	# Note that this will wait only for the first block device attached to
	# a given storage controller. For nvme this may miss some of the devs
	# in case multiple namespaces are in place.
	# FIXME: Wait for nvme controller(s) to be in live state and determine
	# number of configured namespaces, build list of potential block devs
	# and pass them to sync_dev_uevents. Is it worth the effort?
	bdfs_to_wait_for=()
	for bdf in "${!all_devices_d[@]}"; do
		((all_devices_d["$bdf"] == 0)) || continue
		if [[ -n ${nvme_d["$bdf"]} || -n ${virtio_d["$bdf"]} ]]; then
			[[ $(collect_driver "$bdf") != "${drivers_d["$bdf"]}" ]] || continue
			bdfs_to_wait_for+=("$bdf")
		fi
	done
	if ((${#bdfs_to_wait_for[@]} > 0)); then
		echo "Waiting for block devices as requested"
		export UEVENT_TIMEOUT=5 DEVPATH_LOOKUP=yes DEVPATH_SUBSYSTEM=pci
		"$rootdir/scripts/sync_dev_uevents.sh" \
			block/disk \
			"${bdfs_to_wait_for[@]}" &
		sync_pid=$!
	fi
fi

if [[ $os == Linux ]]; then
	if [ "$mode" == "config" ]; then
		configure_linux
	elif [ "$mode" == "cleanup" ]; then
		cleanup_linux
		clear_hugepages
	elif [ "$mode" == "reset" ]; then
		reset_linux
	elif [ "$mode" == "status" ]; then
		status_linux
	elif [ "$mode" == "help" ]; then
		usage $0
	else
		usage $0 "Invalid argument '$mode'"
	fi
else
	if [ "$mode" == "config" ]; then
		configure_freebsd
	elif [ "$mode" == "reset" ]; then
		reset_freebsd
	elif [ "$mode" == "cleanup" ]; then
		echo "setup.sh cleanup function not yet supported on $os"
	elif [ "$mode" == "status" ]; then
		status_freebsd
	elif [ "$mode" == "help" ]; then
		usage $0
	else
		usage $0 "Invalid argument '$mode'"
	fi
fi

if [[ -e /proc/$sync_pid/status ]]; then
	wait "$sync_pid"
fi