#!/usr/bin/env bash
#  SPDX-License-Identifier: BSD-3-Clause
#  Copyright (C) 2016 Intel Corporation
#  All rights reserved.
#
set -e
shopt -s nullglob extglob

os=$(uname -s)

if [[ $os != Linux && $os != FreeBSD ]]; then
	echo "Unsupported platform ($os), aborting"
	exit 1
fi

rootdir=$(readlink -f $(dirname $0))/..
source "$rootdir/scripts/common.sh"

function usage() {
	if [[ $os == Linux ]]; then
		options="[config|reset|status|cleanup|help]"
	else
		options="[config|reset|help]"
	fi

	[[ -n $2 ]] && (
		echo "$2"
		echo ""
	)
	echo "Helper script for allocating hugepages and binding NVMe, I/OAT, VMD and Virtio devices"
	echo "to a generic VFIO kernel driver. If VFIO is not available on the system, this script"
	echo "will fall back to UIO. NVMe and Virtio devices with active mountpoints will be ignored."
	echo "All hugepage operations use the default hugepage size on the system (hugepagesz)."
	echo "Usage: $(basename $1) $options"
	echo
	echo "$options - as follows:"
	echo "config            Default mode. Allocate hugepages and bind PCI devices."
	if [[ $os == Linux ]]; then
		echo "cleanup           Remove any orphaned files that may be left in the system after an SPDK application exits"
	fi
	echo "reset             Rebind PCI devices back to their original drivers."
	echo "                  Also clean up any leftover SPDK files/resources."
	echo "                  Hugepage memory size will remain unchanged."
	if [[ $os == Linux ]]; then
		echo "status            Print status of all SPDK-compatible devices on the system."
	fi
	echo "help              Print this help message."
	echo
	echo "The following environment variables can be specified."
	echo "HUGEMEM           Size of hugepage memory to allocate (in MB). 2048 by default."
	echo "                  For NUMA systems, the hugepages will be distributed on node0 by"
	echo "                  default."
	echo "HUGE_EVEN_ALLOC   If set to 'yes', hugepages will be evenly distributed across all"
	echo "                  of the system's NUMA nodes (effectively ignoring anything set in HUGENODE)."
	echo "                  Uses the kernel's default hugepage size."
	echo "NRHUGE            Number of hugepages to allocate. This variable overrides HUGEMEM."
	echo "HUGENODE          Specific NUMA node to allocate hugepages on. Multiple nodes can be"
	echo "                  separated with commas. By default, NRHUGE will be applied on each node."
	echo "                  Hugepages can be defined per node with e.g.:"
	echo "                  HUGENODE='nodes_hp[0]=2048,nodes_hp[1]=512,2' - this will allocate"
	echo "                  2048 pages for node0, 512 for node1 and the default NRHUGE for node2."
	echo "HUGEPGSZ          Size of the hugepages to use in kB. If not set, the kernel's default"
	echo "                  setting is used."
	echo "SHRINK_HUGE       If set to 'yes', hugepage allocation won't be skipped when the"
	echo "                  number of requested hugepages is lower than what's already"
	echo "                  allocated. Doesn't apply when HUGE_EVEN_ALLOC is in use."
	echo "CLEAR_HUGE        If set to 'yes', an attempt to remove hugepages from all nodes will"
	echo "                  be made prior to allocation."
	echo "PCI_ALLOWED"
	echo "PCI_BLOCKED       Whitespace-separated list of PCI devices (NVMe, I/OAT, VMD, Virtio)."
	echo "                  Each device must be specified as a full PCI address."
	echo "                  E.g. PCI_ALLOWED=\"0000:01:00.0 0000:02:00.0\""
	echo "                  To block all PCI devices: PCI_ALLOWED=\"none\""
	echo "                  To allow all PCI devices except 0000:01:00.0: PCI_BLOCKED=\"0000:01:00.0\""
	echo "                  To allow only PCI device 0000:01:00.0: PCI_ALLOWED=\"0000:01:00.0\""
	echo "                  If PCI_ALLOWED and PCI_BLOCKED are empty or unset, all PCI devices"
	echo "                  will be bound."
	echo "                  Each device in PCI_BLOCKED will be ignored (driver won't be changed)."
	echo "                  PCI_BLOCKED has precedence over PCI_ALLOWED."
	echo "TARGET_USER       User that will own the hugepage mountpoint directory and vfio groups."
	echo "                  By default the current user will be used."
	echo "DRIVER_OVERRIDE   Disable automatic vfio-pci/uio_pci_generic selection and forcefully"
	echo "                  bind devices to the given driver."
	echo "                  E.g. DRIVER_OVERRIDE=uio_pci_generic or DRIVER_OVERRIDE=/home/public/dpdk/build/kmod/igb_uio.ko"
	echo "PCI_BLOCK_SYNC_ON_RESET"
	echo "                  If set in the environment, an attempt to wait for block devices associated"
	echo "                  with the given PCI devices will be made upon reset."
	echo "UNBIND_ENTIRE_IOMMU_GROUP"
	echo "                  If set, all devices from the nvme device's iommu group will be unbound from their drivers."
	echo "                  Use with caution."
	exit 0
}
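
# A few illustrative invocations (the device addresses below are hypothetical;
# adjust them to your system). These combine only the modes and environment
# variables documented in usage() above:
#
#   sudo ./setup.sh                                     # config mode: hugepages + binding
#   sudo HUGEMEM=4096 ./setup.sh config                 # 4 GB of hugepage memory
#   sudo NRHUGE=512 HUGENODE=0,1 ./setup.sh config      # 512 pages on node0 and node1 each
#   sudo PCI_ALLOWED="0000:01:00.0" ./setup.sh config   # touch only this controller
#   sudo DRIVER_OVERRIDE=uio_pci_generic ./setup.sh     # force a specific driver
#   sudo ./setup.sh reset                               # rebind to original drivers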

# In monolithic kernels lsmod won't work, so back it up with a check of
# /sys/module. We also check /sys/bus/pci/drivers/ as neither lsmod nor
# /sys/module might contain the needed info (like on Fedora-like OSes).
function check_for_driver() {
	if [[ -z $1 ]]; then
		return 0
	fi

	if lsmod | grep -q ${1//-/_}; then
		return 1
	fi

	if [[ -d /sys/module/${1} || -d \
		/sys/module/${1//-/_} || -d \
		/sys/bus/pci/drivers/${1} || -d \
		/sys/bus/pci/drivers/${1//-/_} ]]; then
		return 2
	fi
	return 0
}
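
# Note on the return values above, as relied on by the callers later in this
# script: 0 means the driver was not found, 1 means lsmod found it, 2 means
# sysfs knows about it. Hence the negated form "! check_for_driver foo" reads
# as "foo is available". A minimal sketch:
#
#   if ! check_for_driver vfio-pci; then
#   	echo "vfio-pci is loaded or built into the kernel"
#   fi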

function check_for_driver_freebsd() {
	# Check if dpdk drivers (nic_uio, contigmem) are in the kernel's module path.
	local search_paths path driver
	IFS=";" read -ra search_paths < <(kldconfig -rU)

	for driver in contigmem.ko nic_uio.ko; do
		for path in "${search_paths[@]}"; do
			[[ -f $path/$driver ]] && continue 2
		done
		return 1
	done
	return 0
}

function pci_dev_echo() {
	local bdf="$1"
	shift
	echo "$bdf (${pci_ids_vendor["$bdf"]#0x} ${pci_ids_device["$bdf"]#0x}): $*"
}

function probe_driver() {
	local bdf=$1
	local driver_name=$2
	old_driver_name=${drivers_d["$bdf"]:-no driver}

	if [[ $driver_name == "$old_driver_name" ]]; then
		pci_dev_echo "$bdf" "Already using the $old_driver_name driver"
		return 0
	fi

	if [[ $old_driver_name != "no driver" ]]; then
		echo "$bdf" > "/sys/bus/pci/devices/$bdf/driver/unbind"
	fi

	pci_dev_echo "$bdf" "$old_driver_name -> $driver_name"

	if [[ $driver_name == "none" ]]; then
		return 0
	fi

	local probe_attempts=0
	echo "$driver_name" > "/sys/bus/pci/devices/$bdf/driver_override"
	while ! echo "$bdf" > "/sys/bus/pci/drivers_probe" && ((probe_attempts++ < 10)); do
		pci_dev_echo "$bdf" "failed to bind to $driver_name, retrying ($probe_attempts)"
		sleep 0.5
	done 2> /dev/null

	echo "" > "/sys/bus/pci/devices/$bdf/driver_override"

	if [[ ! -e /sys/bus/pci/drivers/$driver_name/$bdf ]]; then
		pci_dev_echo "$bdf" "failed to bind to $driver_name, aborting"
		return 1
	fi
}

function linux_bind_driver() {
	local bdf="$1"
	local driver_name="$2"

	probe_driver "$bdf" "$driver_name"

	iommu_group=$(basename $(readlink -f /sys/bus/pci/devices/$bdf/iommu_group))
	if [ -e "/dev/vfio/$iommu_group" ]; then
		if [ -n "$TARGET_USER" ]; then
			chown "$TARGET_USER" "/dev/vfio/$iommu_group"
		fi
	fi

	local iommug=("/sys/bus/pci/devices/$bdf/iommu_group/devices/"!($bdf))
	local _bdf _driver
	if ((${#iommug[@]} > 0)) && [[ $driver_name == vfio* ]]; then
		pci_dev_echo "$bdf" "WARNING: detected multiple devices (${#iommug[@]}) under the same IOMMU group!"
		for _bdf in "${iommug[@]}"; do
			_driver=$(readlink -f "$_bdf/driver")
			if [[ ! -e $_driver || ${_driver##*/} == "$driver_name" ]]; then
				continue
			fi
			# See what DPDK considers to be a "viable" iommu group: dpdk/lib/eal/linux/eal_vfio.c -> rte_vfio_setup_device()
			pci_dev_echo "$bdf" "WARNING: ${_bdf##*/} not bound to $driver_name (${_driver##*/})"
			pci_dev_echo "$bdf" "WARNING: All devices in the IOMMU group must be bound to the same driver or unbound"
			if [[ $UNBIND_ENTIRE_IOMMU_GROUP == yes ]]; then
				pci_dev_echo "$bdf" "WARNING: Attempting to unbind ${_bdf##*/}"
				drivers_d["${_bdf##*/}"]=${_driver##*/}
				probe_driver "${_bdf##*/}" none
			fi
		done
	fi
}

function linux_unbind_driver() {
	local bdf="$1"
	local old_driver_name=${drivers_d["$bdf"]:-no driver}

	if [[ $old_driver_name == "no driver" ]]; then
		pci_dev_echo "$bdf" "Not bound to any driver"
		return 0
	fi

	if [[ -e /sys/bus/pci/drivers/$old_driver_name ]]; then
		echo "$bdf" > "/sys/bus/pci/drivers/$old_driver_name/unbind"
		echo "" > "/sys/bus/pci/devices/$bdf/driver_override"
	fi

	pci_dev_echo "$bdf" "$old_driver_name -> no driver"
}

function linux_hugetlbfs_mounts() {
	mount | grep ' type hugetlbfs ' | awk '{ print $3 }'
}

function get_block_dev_from_bdf() {
	local bdf=$1
	local block blocks=() ctrl

	for block in /sys/block/*; do
		if [[ $block == *nvme* ]]; then
			ctrl=${block##*/} ctrl=${ctrl%n*}
			if [[ -e /sys/class/nvme/$ctrl && $(< "/sys/class/nvme/$ctrl/address") == "$bdf" ]]; then
				blocks+=("${block##*/}")
			fi
		elif [[ $(readlink -f "$block/device") == *"/$bdf/"* ]]; then
			blocks+=("${block##*/}")
		fi
	done
	printf '%s\n' "${blocks[@]}"
}
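
# For example, on a hypothetical system where 0000:01:00.0 is an NVMe
# controller with two namespaces, the function would print one block device
# per line:
#
#   $ get_block_dev_from_bdf 0000:01:00.0
#   nvme0n1
#   nvme0n2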

function get_used_bdf_block_devs() {
	local bdf=$1
	local blocks block blockp dev mount holder
	local used

	hash lsblk &> /dev/null || return 1
	blocks=($(get_block_dev_from_bdf "$bdf"))

	for block in "${blocks[@]}"; do
		# Check if the device is held by some other device, regardless of
		# whether it's mounted or not.
		for holder in "/sys/class/block/$block"*/holders/*; do
			[[ -e $holder ]] || continue
			blockp=${holder%/holders*} blockp=${blockp##*/}
			if [[ -e $holder/slaves/$blockp ]]; then
				used+=("holder@$blockp:${holder##*/}")
			fi
		done
		while read -r dev mount; do
			if [[ -e $mount ]]; then
				used+=("mount@$block:$dev")
			fi
		done < <(lsblk -l -n -o NAME,MOUNTPOINT "/dev/$block")
		if ((${#used[@]} == 0)); then
			# Make sure we check if there's any valid data present on the target device
			# regardless of whether it's being actively used or not. This is mainly done
			# to make sure we don't miss more complex setups like ZFS pools, etc.
			if block_in_use "$block" > /dev/null; then
				used+=("data@$block")
			fi
		fi
	done

	if ((${#used[@]} > 0)); then
		printf '%s\n' "${used[@]}"
	fi
}

function collect_devices() {
	# NVMe, IOAT, DSA, IAA, VIRTIO, VMD

	local ids dev_type dev_id bdf bdfs in_use driver

	ids+="PCI_DEVICE_ID_INTEL_IOAT"
	ids+="|PCI_DEVICE_ID_INTEL_DSA"
	ids+="|PCI_DEVICE_ID_INTEL_IAA"
	ids+="|PCI_DEVICE_ID_VIRTIO"
	ids+="|PCI_DEVICE_ID_INTEL_VMD"
	ids+="|SPDK_PCI_CLASS_NVME"

	local -gA nvme_d ioat_d dsa_d iaa_d virtio_d vmd_d all_devices_d drivers_d

	while read -r _ dev_type dev_id; do
		bdfs=(${pci_bus_cache["0x8086:$dev_id"]})
		[[ $dev_type == *NVME* ]] && bdfs=(${pci_bus_cache["$dev_id"]})
		[[ $dev_type == *VIRT* ]] && bdfs=(${pci_bus_cache["0x1af4:$dev_id"]})
		[[ $dev_type =~ (NVME|IOAT|DSA|IAA|VIRTIO|VMD) ]] && dev_type=${BASH_REMATCH[1],,}
		for bdf in "${bdfs[@]}"; do
			in_use=0
			if [[ $1 != status ]]; then
				if ! pci_can_use "$bdf"; then
					pci_dev_echo "$bdf" "Skipping denied controller at $bdf"
					in_use=1
				fi
				if [[ $dev_type == nvme || $dev_type == virtio ]]; then
					if ! verify_bdf_block_devs "$bdf"; then
						in_use=1
					fi
				fi
				if [[ $dev_type == vmd ]]; then
					if [[ $PCI_ALLOWED != *"$bdf"* ]]; then
						pci_dev_echo "$bdf" "Skipping not allowed VMD controller at $bdf"
						in_use=1
					elif [[ " ${drivers_d[*]} " =~ "nvme" ]]; then
						if [[ "${DRIVER_OVERRIDE}" != "none" ]]; then
							if [ "$mode" == "config" ]; then
								cat <<- MESSAGE
									Binding new driver to VMD device. If there are NVMe SSDs behind the VMD endpoint
									which are attached to the kernel NVMe driver, the binding process may go faster
									if you first run this script with DRIVER_OVERRIDE="none" to unbind only the
									NVMe SSDs, and then run again to unbind the VMD devices.
								MESSAGE
							fi
						fi
					fi
				fi
			fi
			eval "${dev_type}_d[$bdf]=$in_use"
			all_devices_d["$bdf"]=$in_use
			if [[ -e /sys/bus/pci/devices/$bdf/driver ]]; then
				driver=$(readlink -f "/sys/bus/pci/devices/$bdf/driver")
				drivers_d["$bdf"]=${driver##*/}
			fi
		done
	done < <(grep -E "$ids" "$rootdir/include/spdk/pci_ids.h")
}

function collect_driver() {
	local bdf=$1
	local drivers driver

	if [[ -e /sys/bus/pci/devices/$bdf/modalias ]] \
		&& drivers=($(modprobe -R "$(< "/sys/bus/pci/devices/$bdf/modalias")")); then
		# Pick the first entry in case multiple aliases are bound to a driver.
		driver=$(readlink -f "/sys/module/${drivers[0]}/drivers/pci:"*)
		driver=${driver##*/}
	else
		[[ -n ${nvme_d["$bdf"]} ]] && driver=nvme
		[[ -n ${ioat_d["$bdf"]} ]] && driver=ioatdma
		[[ -n ${dsa_d["$bdf"]} ]] && driver=dsa
		[[ -n ${iaa_d["$bdf"]} ]] && driver=iaa
		[[ -n ${virtio_d["$bdf"]} ]] && driver=virtio-pci
		[[ -n ${vmd_d["$bdf"]} ]] && driver=vmd
	fi 2> /dev/null
	echo "$driver"
}
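
# The modalias lookup above is roughly equivalent to running the following for
# a hypothetical address 0000:01:00.0:
#
#   modprobe -R "$(cat /sys/bus/pci/devices/0000:01:00.0/modalias)"
#
# which resolves the device's modalias string to the name of the kernel module
# (e.g. nvme) that would normally claim it.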

function verify_bdf_block_devs() {
	local bdf=$1
	local blknames
	blknames=($(get_used_bdf_block_devs "$bdf")) || return 1

	if ((${#blknames[@]} > 0)); then
		local IFS=","
		pci_dev_echo "$bdf" "Active devices: ${blknames[*]}, so not binding PCI dev"
		return 1
	fi
}

function configure_linux_pci() {
	local driver_path=""
	driver_name=""
	igb_uio_fallback=""

	if [[ -r "$rootdir/dpdk/build-tmp/kernel/linux/igb_uio/igb_uio.ko" ]]; then
		# igb_uio is a common driver to override with and it depends on uio.
		modprobe uio || true
		if ! check_for_driver igb_uio || insmod "$rootdir/dpdk/build-tmp/kernel/linux/igb_uio/igb_uio.ko"; then
			igb_uio_fallback="$rootdir/dpdk/build-tmp/kernel/linux/igb_uio/igb_uio.ko"
		fi
	fi

	if [[ "${DRIVER_OVERRIDE}" == "none" ]]; then
		driver_name=none
	elif [[ -n "${DRIVER_OVERRIDE}" ]]; then
		driver_path="$DRIVER_OVERRIDE"
		driver_name="${DRIVER_OVERRIDE##*/}"
		# modprobe and the sysfs don't use the .ko suffix.
		driver_name=${driver_name%.ko}
		# path = name -> there is no path
		if [[ "$driver_path" = "$driver_name" ]]; then
			driver_path=""
		fi
	elif [[ -n "$(ls /sys/kernel/iommu_groups)" || (-e \
	/sys/module/vfio/parameters/enable_unsafe_noiommu_mode && \
	"$(cat /sys/module/vfio/parameters/enable_unsafe_noiommu_mode)" == "Y") ]]; then
		driver_name=vfio-pci
		# Just in case, attempt to load the VFIO_IOMMU_TYPE1 module into the kernel - this
		# should be done automatically by modprobe since this particular module should
		# be a part of vfio-pci dependencies, however, on some distros, it seems that
		# it's not the case. See #1689.
		if modinfo vfio_iommu_type1 > /dev/null; then
			modprobe vfio_iommu_type1
		fi
	elif ! check_for_driver uio_pci_generic || modinfo uio_pci_generic > /dev/null 2>&1; then
		driver_name=uio_pci_generic
	elif [[ -e $igb_uio_fallback ]]; then
		driver_path="$igb_uio_fallback"
		driver_name="igb_uio"
		echo "WARNING: uio_pci_generic not detected - using $driver_name"
	else
		echo "No valid drivers found [vfio-pci, uio_pci_generic, igb_uio]. Please enable one of the kernel modules."
		return 1
	fi

	# modprobe assumes the module lives in the kernel's module directory. If the
	# user passed in a path, we should use insmod instead.
	if [[ $driver_name != "none" ]]; then
		if [[ -n "$driver_path" ]]; then
			insmod $driver_path || true
		else
			modprobe $driver_name
		fi
	fi

	for bdf in "${!all_devices_d[@]}"; do
		if ((all_devices_d["$bdf"] == 0)); then
			if [[ -n ${nvme_d["$bdf"]} ]]; then
				# Some nvme controllers may take a significant amount of time while being
				# unbound from the driver. Put that task into the background to speed up the
				# whole process. Currently this is done only for the devices bound to the
				# nvme driver as others, i.e. ioatdma, trigger a kernel BUG when being
				# unbound in parallel. See https://bugzilla.kernel.org/show_bug.cgi?id=209041.
				linux_bind_driver "$bdf" "$driver_name" &
			else
				linux_bind_driver "$bdf" "$driver_name"
			fi
		fi
	done
	wait

	echo "1" > "/sys/bus/pci/rescan"
}

function cleanup_linux() {
	local dirs_to_clean=() files_to_clean=() opened_files=() file_locks=()
	local match_spdk="spdk_tgt|iscsi|vhost|nvmf|rocksdb|bdevio|bdevperf|vhost_fuzz|nvme_fuzz|accel_perf|bdev_svc"

	dirs_to_clean=({/var/run,/tmp}/dpdk/spdk{,_pid}+([0-9]))
	if [[ -d $XDG_RUNTIME_DIR ]]; then
		dirs_to_clean+=("$XDG_RUNTIME_DIR/dpdk/spdk"{,_pid}+([0-9]))
	fi

	for dir in "${dirs_to_clean[@]}"; do
		files_to_clean+=("$dir/"*)
	done
	file_locks+=(/var/tmp/spdk_pci_lock*)
	file_locks+=(/var/tmp/spdk_cpu_lock*)

	files_to_clean+=(/dev/shm/@(@($match_spdk)_trace|spdk_iscsi_conns)*)
	files_to_clean+=("${file_locks[@]}")

	# This may fail in case a path that readlink attempts to resolve suddenly
	# disappears (as it may happen with terminating processes).
	opened_files+=($(readlink -f /proc/+([0-9])/fd/+([0-9]))) || true

	if ((${#opened_files[@]} == 0)); then
		echo "Can't get list of opened files!"
		exit 1
	fi

	echo 'Cleaning'
	for f in "${files_to_clean[@]}"; do
		[[ -e $f ]] || continue
		if [[ ${opened_files[*]} != *"$f"* ]]; then
			echo "Removing:    $f"
			rm $f
		else
			echo "Still open: $f"
		fi
	done

	for dir in "${dirs_to_clean[@]}"; do
		[[ -d $dir ]] || continue
		if [[ ${opened_files[*]} != *"$dir"* ]]; then
			echo "Removing:    $dir"
			rmdir $dir
		else
			echo "Still open: $dir"
		fi
	done
	echo "Clean"
}

check_hugepages_alloc() {
	local hp_int=$1
	local allocated_hugepages

	allocated_hugepages=$(< "$hp_int")

	if ((NRHUGE <= allocated_hugepages)) && [[ $SHRINK_HUGE != yes ]]; then
		echo "INFO: Requested $NRHUGE hugepages but $allocated_hugepages already allocated ${2:+on node$2}"
		return 0
	fi

	echo $((NRHUGE < 0 ? 0 : NRHUGE)) > "$hp_int"

	allocated_hugepages=$(< "$hp_int")
	if ((allocated_hugepages < NRHUGE)); then
		cat <<- ERROR

			## ERROR: requested $NRHUGE hugepages but only $allocated_hugepages could be allocated ${2:+on node$2}.
			## Memory might be heavily fragmented. Please try flushing the system cache, or reboot the machine.
		ERROR
		return 1
	fi
}
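
# For reference, the interfaces written by check_hugepages_alloc() are the
# standard kernel knobs; a manual allocation on a hypothetical two-node system
# with 2 MB hugepages would look like:
#
#   echo 1024 > /proc/sys/vm/nr_hugepages    # all nodes, default hugepage size
#   echo 512 > /sys/devices/system/node/node0/hugepages/hugepages-2048kB/nr_hugepages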

clear_hugepages() { echo 0 > /proc/sys/vm/nr_hugepages; }

configure_linux_hugepages() {
	local node system_nodes
	local nodes_to_use nodes_hp

	if [[ $CLEAR_HUGE == yes ]]; then
		clear_hugepages
	fi

	if [[ $HUGE_EVEN_ALLOC == yes ]]; then
		clear_hugepages
		check_hugepages_alloc /proc/sys/vm/nr_hugepages
		return 0
	fi

	for node in /sys/devices/system/node/node*; do
		[[ -e $node ]] || continue
		nodes[${node##*node}]=$node/hugepages/hugepages-${HUGEPGSZ}kB/nr_hugepages
	done

	if ((${#nodes[@]} == 0)); then
		# No NUMA support? Fall back to the common interface.
		check_hugepages_alloc /proc/sys/vm/nr_hugepages
		return 0
	fi

	IFS="," read -ra nodes_to_use <<< "$HUGENODE"
	if ((${#nodes_to_use[@]} == 0)); then
		nodes_to_use[0]=0
	fi

	# Align indexes with node ids
	for node in "${!nodes_to_use[@]}"; do
		if [[ ${nodes_to_use[node]} =~ ^nodes_hp\[[0-9]+\]= ]]; then
			eval "${nodes_to_use[node]}"
		elif [[ ${nodes_to_use[node]} =~ ^[0-9]+$ ]]; then
			nodes_hp[nodes_to_use[node]]=$NRHUGE
		fi
	done

	for node in "${!nodes_hp[@]}"; do
		if [[ -z ${nodes[node]} ]]; then
			echo "Node $node doesn't exist, ignoring" >&2
			continue
		fi
		NRHUGE=${nodes_hp[node]:-$NRHUGE} check_hugepages_alloc "${nodes[node]}" "$node"
	done
}
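
# To illustrate the parsing above: HUGENODE='nodes_hp[0]=2048,nodes_hp[1]=512,2'
# evals the first two entries directly into nodes_hp[0]=2048 and nodes_hp[1]=512,
# while the bare "2" assigns the default NRHUGE to nodes_hp[2].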

function configure_linux() {
	configure_linux_pci
	hugetlbfs_mounts=$(linux_hugetlbfs_mounts)

	if [ -z "$hugetlbfs_mounts" ]; then
		hugetlbfs_mounts=/mnt/huge
		echo "Mounting hugetlbfs at $hugetlbfs_mounts"
		mkdir -p "$hugetlbfs_mounts"
		mount -t hugetlbfs nodev "$hugetlbfs_mounts"
	fi

	configure_linux_hugepages

	if [ "$driver_name" = "vfio-pci" ]; then
		if [ -n "$TARGET_USER" ]; then
			for mount in $hugetlbfs_mounts; do
				chown "$TARGET_USER" "$mount"
				chmod g+w "$mount"
			done

			MEMLOCK_AMNT=$(su "$TARGET_USER" -c "ulimit -l")
			if [[ $MEMLOCK_AMNT != "unlimited" ]]; then
				MEMLOCK_MB=$((MEMLOCK_AMNT / 1024))
				cat <<- MEMLOCK
					"$TARGET_USER" user memlock limit: $MEMLOCK_MB MB

					This is the maximum amount of memory you will be
					able to use with DPDK and VFIO if run as user "$TARGET_USER".
					To change this, please adjust limits.conf memlock limit for user "$TARGET_USER".
				MEMLOCK
				if ((MEMLOCK_AMNT < 65536)); then
					echo ""
					echo "## WARNING: memlock limit is less than 64MB"
					echo -n "## DPDK with VFIO may not be able to initialize "
					echo "if run as user \"$TARGET_USER\"."
				fi
			fi
		fi
	fi

	if [ $(uname -i) == "x86_64" ] && [ ! -e /dev/cpu/0/msr ]; then
		# Some distros build msr as a module. Make sure it's loaded to ensure
		# DPDK can easily figure out the TSC rate rather than relying on 100ms
		# sleeps.
		modprobe msr &> /dev/null || true
	fi
}

function reset_linux_pci() {
	# virtio
	# TODO: check if virtio-pci is loaded first and just unbind if it is not loaded
	# Requires some more investigation - for example, some kernels do not seem to have
	#  virtio-pci but just virtio_scsi instead. Also need to make sure we get the
	#  underscore vs. dash right in the virtio_scsi name.
	modprobe virtio-pci || true
	for bdf in "${!all_devices_d[@]}"; do
		((all_devices_d["$bdf"] == 0)) || continue

		driver=$(collect_driver "$bdf")
		if [[ -n $driver ]] && ! check_for_driver "$driver"; then
			linux_bind_driver "$bdf" "$driver"
		else
			linux_unbind_driver "$bdf"
		fi
	done

	echo "1" > "/sys/bus/pci/rescan"
}

function reset_linux() {
	reset_linux_pci
	for mount in $(linux_hugetlbfs_mounts); do
		for hp in "$mount"/spdk*map_*; do
			flock -n "$hp" true && rm -f "$hp"
		done
	done
	rm -f /run/.spdk*
}

function status_linux() {
	echo "Hugepages" >&2
	printf "%-6s %10s %8s / %6s\n" "node" "hugesize" "free" "total" >&2

	numa_nodes=0
	for path in /sys/devices/system/node/node*/hugepages/hugepages-*/; do
		numa_nodes=$((numa_nodes + 1))
		free_pages=$(cat $path/free_hugepages)
		all_pages=$(cat $path/nr_hugepages)

		[[ $path =~ (node[0-9]+)/hugepages/hugepages-([0-9]+kB) ]]

		node=${BASH_REMATCH[1]}
		huge_size=${BASH_REMATCH[2]}

		printf "%-6s %10s %8s / %6s\n" $node $huge_size $free_pages $all_pages
	done

	# fall back to system-wide hugepages
	if [ "$numa_nodes" = "0" ]; then
		free_pages=$(grep HugePages_Free /proc/meminfo | awk '{ print $2 }')
		all_pages=$(grep HugePages_Total /proc/meminfo | awk '{ print $2 }')
		node="-"
		huge_size="$HUGEPGSZ"

		printf "%-6s %10s %8s / %6s\n" $node $huge_size $free_pages $all_pages
	fi

	printf '\n%-8s %-15s %-6s %-6s %-7s %-16s %-10s %s\n' \
		"Type" "BDF" "Vendor" "Device" "NUMA" "Driver" "Device" "Block devices" >&2

	sorted_bdfs=($(printf '%s\n' "${!all_devices_d[@]}" | sort))

	for bdf in "${sorted_bdfs[@]}"; do
		driver=${drivers_d["$bdf"]}
		if [ "$numa_nodes" = "0" ]; then
			node="-"
		else
			node=$(cat /sys/bus/pci/devices/$bdf/numa_node)
			if ((node == -1)); then
				node=unknown
			fi
		fi
		if [ "$driver" = "nvme" ] && [ -d /sys/bus/pci/devices/$bdf/nvme ]; then
			name=$(ls /sys/bus/pci/devices/$bdf/nvme)
		else
			name="-"
		fi

		if [[ -n ${nvme_d["$bdf"]} || -n ${virtio_d["$bdf"]} ]]; then
			blknames=($(get_block_dev_from_bdf "$bdf"))
		else
			blknames=("-")
		fi

		desc=""
		desc=${desc:-${nvme_d["$bdf"]:+NVMe}}
		desc=${desc:-${ioat_d["$bdf"]:+I/OAT}}
		desc=${desc:-${dsa_d["$bdf"]:+DSA}}
		desc=${desc:-${iaa_d["$bdf"]:+IAA}}
		desc=${desc:-${virtio_d["$bdf"]:+virtio}}
		desc=${desc:-${vmd_d["$bdf"]:+VMD}}

		printf '%-8s %-15s %-6s %-6s %-7s %-16s %-10s %s\n' \
			"$desc" "$bdf" "${pci_ids_vendor["$bdf"]#0x}" "${pci_ids_device["$bdf"]#0x}" \
			"$node" "${driver:--}" "${name:-}" "${blknames[*]:--}"
	done
}

function status_freebsd() {
	local pci

	status_print() (
		local type=$1
		local dev driver

		shift

		for pci; do
			printf '%-8s %-15s %-6s %-6s %-16s\n' \
				"$type" \
				"$pci" \
				"${pci_ids_vendor["$pci"]}" \
				"${pci_ids_device["$pci"]}" \
				"${pci_bus_driver["$pci"]}"
		done | sort -k2,2
	)

	local contigmem=present
	local contigmem_buffer_size
	local contigmem_num_buffers

	if ! kldstat -q -m contigmem; then
		contigmem="not present"
	fi
	if ! contigmem_buffer_size=$(kenv hw.contigmem.buffer_size 2> /dev/null); then
		contigmem_buffer_size="not set"
	fi
	if ! contigmem_num_buffers=$(kenv hw.contigmem.num_buffers 2> /dev/null); then
		contigmem_num_buffers="not set"
	fi

	cat <<- BSD_INFO
		Contigmem ($contigmem)
		Buffer Size: $contigmem_buffer_size
		Num Buffers: $contigmem_num_buffers

	BSD_INFO

	printf '\n%-8s %-15s %-6s %-6s %-16s\n' \
		"Type" "BDF" "Vendor" "Device" "Driver" >&2

	status_print "NVMe" "${!nvme_d[@]}"
	status_print "I/OAT" "${!ioat_d[@]}"
	status_print "DSA" "${!dsa_d[@]}"
	status_print "IAA" "${!iaa_d[@]}"
	status_print "VMD" "${!vmd_d[@]}"
}

function configure_freebsd_pci() {
	local BDFS

	BDFS+=("${!nvme_d[@]}")
	BDFS+=("${!ioat_d[@]}")
	BDFS+=("${!dsa_d[@]}")
	BDFS+=("${!iaa_d[@]}")
	BDFS+=("${!vmd_d[@]}")

	# Drop the domain part from all the addresses
	BDFS=("${BDFS[@]#*:}")

	local IFS=","
	kldunload nic_uio.ko || true
	kenv hw.nic_uio.bdfs="${BDFS[*]}"
	kldload nic_uio.ko
}
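
# E.g. with hypothetical NVMe controllers at 0000:01:00.0 and 0000:02:00.0,
# the function above effectively runs:
#
#   kenv hw.nic_uio.bdfs="01:00.0,02:00.0"
#   kldload nic_uio.ko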

function configure_freebsd() {
	if ! check_for_driver_freebsd; then
		echo "DPDK drivers (contigmem and/or nic_uio) are missing, aborting" >&2
		return 1
	fi
	configure_freebsd_pci
	# If contigmem is already loaded but the HUGEMEM specified doesn't match the
	#  previous value, unload contigmem so that we can reload with the new value.
	if kldstat -q -m contigmem; then
		# contigmem may be loaded, but the kernel environment doesn't
		# necessarily have to be set at this point. If it isn't, kenv will
		# fail to pick up the hw. options. Handle it.
		if ! contigmem_num_buffers=$(kenv hw.contigmem.num_buffers); then
			contigmem_num_buffers=-1
		fi 2> /dev/null
		if ((contigmem_num_buffers != HUGEMEM / 256)); then
			kldunload contigmem.ko
		fi
	fi
	if ! kldstat -q -m contigmem; then
		kenv hw.contigmem.num_buffers=$((HUGEMEM / 256))
		kenv hw.contigmem.buffer_size=$((256 * 1024 * 1024))
		kldload contigmem.ko
	fi
}
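
# E.g. with the default HUGEMEM=2048, contigmem is loaded with 2048 / 256 = 8
# buffers of 256 MB each.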

function reset_freebsd() {
	kldunload contigmem.ko || true
	kldunload nic_uio.ko || true
}

CMD=reset cache_pci_bus

mode=$1

if [ -z "$mode" ]; then
	mode="config"
fi

: ${HUGEMEM:=2048}
: ${PCI_ALLOWED:=""}
: ${PCI_BLOCKED:=""}

if [ -n "$NVME_ALLOWED" ]; then
	PCI_ALLOWED="$PCI_ALLOWED $NVME_ALLOWED"
fi

if [ -n "$SKIP_PCI" ]; then
	PCI_ALLOWED="none"
fi

if [ -z "$TARGET_USER" ]; then
	TARGET_USER="$SUDO_USER"
	if [ -z "$TARGET_USER" ]; then
		TARGET_USER=$(logname 2> /dev/null) || true
	fi
fi

collect_devices "$mode"

if [[ $mode == reset && $PCI_BLOCK_SYNC_ON_RESET == yes ]]; then
	# Note that this will wait only for the first block device attached to
	# a given storage controller. For nvme this may miss some of the devs
	# in case multiple namespaces are in place.
	# FIXME: Wait for nvme controller(s) to be in live state and determine
	# number of configured namespaces, build list of potential block devs
	# and pass them to sync_dev_uevents. Is it worth the effort?
	bdfs_to_wait_for=()
	for bdf in "${!all_devices_d[@]}"; do
		((all_devices_d["$bdf"] == 0)) || continue
		if [[ -n ${nvme_d["$bdf"]} || -n ${virtio_d["$bdf"]} ]]; then
			[[ $(collect_driver "$bdf") != "${drivers_d["$bdf"]}" ]] || continue
			bdfs_to_wait_for+=("$bdf")
		fi
	done
	if ((${#bdfs_to_wait_for[@]} > 0)); then
		echo "Waiting for block devices as requested"
		export UEVENT_TIMEOUT=5 DEVPATH_LOOKUP=yes DEVPATH_SUBSYSTEM=pci
		"$rootdir/scripts/sync_dev_uevents.sh" \
			block/disk \
			"${bdfs_to_wait_for[@]}" &
		sync_pid=$!
	fi
fi

if [[ $os == Linux ]]; then
	if [[ -n $HUGEPGSZ && ! -e /sys/kernel/mm/hugepages/hugepages-${HUGEPGSZ}kB ]]; then
		echo "${HUGEPGSZ}kB is not supported by the running kernel, ignoring" >&2
		unset -v HUGEPGSZ
	fi

	HUGEPGSZ=${HUGEPGSZ:-$(grep Hugepagesize /proc/meminfo | cut -d : -f 2 | tr -dc '0-9')}
	HUGEPGSZ_MB=$((HUGEPGSZ / 1024))
	: ${NRHUGE=$(((HUGEMEM + HUGEPGSZ_MB - 1) / HUGEPGSZ_MB))}
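	# E.g. with the default HUGEMEM=2048 MB and 2048 kB (2 MB) hugepages:
	# HUGEPGSZ_MB = 2, so NRHUGE = (2048 + 2 - 1) / 2 = 1024 pages. With 1 GB
	# pages (HUGEPGSZ=1048576), NRHUGE = (2048 + 1023) / 1024 = 2 pages.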

	if [ "$mode" == "config" ]; then
		configure_linux
	elif [ "$mode" == "cleanup" ]; then
		cleanup_linux
		clear_hugepages
	elif [ "$mode" == "reset" ]; then
		reset_linux
	elif [ "$mode" == "status" ]; then
		status_linux
	elif [ "$mode" == "help" ]; then
		usage $0
	else
		usage $0 "Invalid argument '$mode'"
	fi
else
	if [ "$mode" == "config" ]; then
		configure_freebsd
	elif [ "$mode" == "reset" ]; then
		reset_freebsd
	elif [ "$mode" == "cleanup" ]; then
		echo "setup.sh cleanup function not yet supported on $os"
	elif [ "$mode" == "status" ]; then
		status_freebsd
	elif [ "$mode" == "help" ]; then
		usage $0
	else
		usage $0 "Invalid argument '$mode'"
	fi
fi

if [[ -e /proc/$sync_pid/status ]]; then
	wait "$sync_pid"
fi