xref: /spdk/scripts/setup.sh (revision a6dbe3721eb3b5990707fc3e378c95e505dd8ab5)
#!/usr/bin/env bash

# Abort on the first failing command; expand unmatched globs to nothing and
# enable extended glob patterns (both are relied upon by cleanup_linux()).
set -e
shopt -s nullglob extglob

os=$(uname -s)

# Only Linux and FreeBSD are supported.
if [[ $os != Linux && $os != FreeBSD ]]; then
	echo "Not supported platform ($os), aborting"
	exit 1
fi

# Repository root (this script lives under scripts/) and shared helpers
# (cache_pci_bus, pci_bus_cache, pci_ids_vendor/pci_ids_device, pci_can_use,
# block_in_use, ... are all defined in common.sh).
rootdir=$(readlink -f $(dirname $0))/..
source "$rootdir/scripts/common.sh"
15 
# Print the help text and exit (always with status 0).
# Arguments: $1 - path this script was invoked as (used in the "Usage:" line)
#            $2 - optional error message to print before the help text
function usage() {
	if [[ $os == Linux ]]; then
		options="[config|reset|status|cleanup|help]"
	else
		options="[config|reset|help]"
	fi

	# Print the optional error message first (subshell keeps it one statement).
	[[ -n $2 ]] && (
		echo "$2"
		echo ""
	)
	echo "Helper script for allocating hugepages and binding NVMe, I/OAT, VMD and Virtio devices"
	echo "to a generic VFIO kernel driver. If VFIO is not available on the system, this script"
	echo "will fall back to UIO. NVMe and Virtio devices with active mountpoints will be ignored."
	echo "All hugepage operations use default hugepage size on the system (hugepagesz)."
	echo "Usage: $(basename $1) $options"
	echo
	echo "$options - as following:"
	echo "config            Default mode. Allocate hugepages and bind PCI devices."
	if [[ $os == Linux ]]; then
		echo "cleanup           Remove any orphaned files that can be left in the system after SPDK application exit"
	fi
	echo "reset             Rebind PCI devices back to their original drivers."
	echo "                  Also cleanup any leftover spdk files/resources."
	echo "                  Hugepage memory size will remain unchanged."
	if [[ $os == Linux ]]; then
		echo "status            Print status of all SPDK-compatible devices on the system."
	fi
	echo "help              Print this help message."
	echo
	echo "The following environment variables can be specified."
	echo "HUGEMEM           Size of hugepage memory to allocate (in MB). 2048 by default."
	echo "                  For NUMA systems, the hugepages will be distributed on node0 by"
	echo "                  default."
	echo "HUGE_EVEN_ALLOC   If set to 'yes', hugepages will be evenly distributed across all"
	echo "                  system's NUMA nodes (effectively ignoring anything set in HUGENODE)."
	echo "                  Uses kernel's default for hugepages size."
	echo "NRHUGE            Number of hugepages to allocate. This variable overwrites HUGEMEM."
	echo "HUGENODE          Specific NUMA node to allocate hugepages on. Multiple nodes can be"
	echo "                  separated with comas. By default, NRHUGE will be applied on each node."
	echo "                  Hugepages can be defined per node with e.g.:"
	echo "                  HUGENODE='nodes_hp[0]=2048,nodes_hp[1]=512,2' - this will allocate"
	echo "                  2048 pages for node0, 512 for node1 and default NRHUGE for node2."
	echo "HUGEPGSZ          Size of the hugepages to use in kB. If not set, kernel's default"
	echo "                  setting is used."
	echo "SHRINK_HUGE       If set to 'yes', hugepages allocation won't be skipped in case"
	echo "                  number of requested hugepages is lower from what's already"
	echo "                  allocated. Doesn't apply when HUGE_EVEN_ALLOC is in use."
	echo "CLEAR_HUGE        If set to 'yes', the attempt to remove hugepages from all nodes will"
	echo "                  be made prior to allocation".
	echo "PCI_ALLOWED"
	echo "PCI_BLOCKED       Whitespace separated list of PCI devices (NVMe, I/OAT, VMD, Virtio)."
	echo "                  Each device must be specified as a full PCI address."
	echo "                  E.g. PCI_ALLOWED=\"0000:01:00.0 0000:02:00.0\""
	echo "                  To block all PCI devices: PCI_ALLOWED=\"none\""
	echo "                  To allow all PCI devices except 0000:01:00.0: PCI_BLOCKED=\"0000:01:00.0\""
	echo "                  To allow only PCI device 0000:01:00.0: PCI_ALLOWED=\"0000:01:00.0\""
	echo "                  If PCI_ALLOWED and PCI_BLOCKED are empty or unset, all PCI devices"
	echo "                  will be bound."
	echo "                  Each device in PCI_BLOCKED will be ignored (driver won't be changed)."
	echo "                  PCI_BLOCKED has precedence over PCI_ALLOWED."
	echo "TARGET_USER       User that will own hugepage mountpoint directory and vfio groups."
	echo "                  By default the current user will be used."
	echo "DRIVER_OVERRIDE   Disable automatic vfio-pci/uio_pci_generic selection and forcefully"
	echo "                  bind devices to the given driver."
	echo "                  E.g. DRIVER_OVERRIDE=uio_pci_generic or DRIVER_OVERRIDE=/home/public/dpdk/build/kmod/igb_uio.ko"
	echo "PCI_BLOCK_SYNC_ON_RESET"
	echo "                  If set in the environment, the attempt to wait for block devices associated"
	echo "                  with given PCI device will be made upon reset"
	exit 0
}
87 
# Check whether a kernel driver/module is loaded or present on the system.
# In monolithic kernels lsmod won't work, so back that with /sys/module.
# We also check /sys/bus/pci/drivers/ as neither lsmod nor /sys/module
# might contain needed info (like in Fedora-like OS).
# Arguments: $1 - driver name (dashes and underscores are equivalent)
# Returns:   0 - driver not found (or empty name), 1 - listed by lsmod,
#            2 - present under sysfs. Callers treat non-zero as "present".
function check_for_driver() {
	local mod_name=${1//-/_}

	if [[ -z $1 ]]; then
		return 0
	fi

	# Anchor the match to lsmod's module-name column so that e.g. "nvme"
	# doesn't falsely match an "nvme_core" line (a bare substring did).
	if lsmod | grep -q "^${mod_name}[[:space:]]"; then
		return 1
	fi

	if [[ -d /sys/module/${1} || -d \
		/sys/module/${mod_name} || -d \
		/sys/bus/pci/drivers/${1} || -d \
		/sys/bus/pci/drivers/${mod_name} ]]; then
		return 2
	fi
	return 0
}
109 
# Verify the DPDK kernel modules are installed on FreeBSD.
# kldconfig -rU prints the kernel module search paths separated by ';'.
# Returns: 0 - both contigmem.ko and nic_uio.ko were found, 1 otherwise.
function check_for_driver_freebsd() {
	# Check if dpdk drivers (nic_uio, contigmem) are in the kernel's module path.
	local search_paths path driver
	IFS=";" read -ra search_paths < <(kldconfig -rU)

	for driver in contigmem.ko nic_uio.ko; do
		for path in "${search_paths[@]}"; do
			# Found this driver - move on to the next one (continue 2 skips
			# the "return 1" below).
			[[ -f $path/$driver ]] && continue 2
		done
		return 1
	done
	return 0
}
123 
# Log a message prefixed with the device's PCI address and vendor/device ids
# (taken from the global maps filled in by common.sh, "0x" prefix stripped).
# Arguments: $1 - PCI address (bdf), remaining args - message to print.
function pci_dev_echo() {
	local bdf=$1
	local vendor device
	shift
	vendor=${pci_ids_vendor["$bdf"]#0x}
	device=${pci_ids_device["$bdf"]#0x}
	echo "$bdf ($vendor $device): $*"
}
129 
# Bind a PCI device to the given kernel driver, releasing it from its current
# driver first (per the drivers_d map filled in by collect_devices()).
# Arguments: $1 - PCI address (bdf), $2 - target driver name; "none" unbinds only.
# NOTE(review): bdf, driver_name, old_driver_name, ven_dev_id and iommu_group
# are deliberately NOT local - driver_name in particular is shared with
# configure_linux_pci()/configure_linux() (see the igb_uio fallback below).
function linux_bind_driver() {
	bdf="$1"
	driver_name="$2"
	old_driver_name=${drivers_d["$bdf"]:-no driver}
	ven_dev_id="${pci_ids_vendor["$bdf"]#0x} ${pci_ids_device["$bdf"]#0x}"

	if [[ $driver_name == "$old_driver_name" ]]; then
		pci_dev_echo "$bdf" "Already using the $old_driver_name driver"
		return 0
	fi

	if [[ $old_driver_name != "no driver" ]]; then
		# Drop the id from the old driver's match table, then release the device.
		echo "$ven_dev_id" > "/sys/bus/pci/devices/$bdf/driver/remove_id" 2> /dev/null || true
		echo "$bdf" > "/sys/bus/pci/devices/$bdf/driver/unbind"
	fi

	pci_dev_echo "$bdf" "$old_driver_name -> $driver_name"

	if [[ $driver_name == "none" ]]; then
		return 0
	fi

	# Teach the new driver about this vendor:device pair, then bind explicitly
	# in case new_id alone didn't trigger the bind. Both may benignly fail.
	echo "$ven_dev_id" > "/sys/bus/pci/drivers/$driver_name/new_id" 2> /dev/null || true
	echo "$bdf" > "/sys/bus/pci/drivers/$driver_name/bind" 2> /dev/null || true

	if [[ $driver_name == uio_pci_generic ]] && ! check_for_driver igb_uio; then
		# Check if the uio_pci_generic driver is broken as it might be in
		# some 4.18.x kernels (see centos8 for instance) - if our device
		# didn't get a proper uio entry, fallback to igb_uio
		if [[ ! -e /sys/bus/pci/devices/$bdf/uio ]]; then
			pci_dev_echo "$bdf" "uio_pci_generic potentially broken, moving to igb_uio"
			drivers_d["$bdf"]="no driver"
			# This call will override $driver_name for remaining devices as well
			linux_bind_driver "$bdf" igb_uio
		fi
	fi

	# If the device landed in a vfio group, hand the group device over to
	# TARGET_USER so SPDK can run unprivileged.
	iommu_group=$(basename $(readlink -f /sys/bus/pci/devices/$bdf/iommu_group))
	if [ -e "/dev/vfio/$iommu_group" ]; then
		if [ -n "$TARGET_USER" ]; then
			chown "$TARGET_USER" "/dev/vfio/$iommu_group"
		fi
	fi
}
174 
# Unbind a PCI device from its current driver (as recorded in drivers_d) and
# remove the vendor:device pair from that driver's id table.
# Arguments: $1 - PCI address (bdf).
function linux_unbind_driver() {
	local bdf="$1"
	local ven_dev_id
	ven_dev_id="${pci_ids_vendor["$bdf"]#0x} ${pci_ids_device["$bdf"]#0x}"
	local old_driver_name=${drivers_d["$bdf"]:-no driver}

	if [[ $old_driver_name == "no driver" ]]; then
		pci_dev_echo "$bdf" "Not bound to any driver"
		return 0
	fi

	if [[ -e /sys/bus/pci/drivers/$old_driver_name ]]; then
		# remove_id may fail if the id was never registered - ignore that.
		echo "$ven_dev_id" > "/sys/bus/pci/drivers/$old_driver_name/remove_id" 2> /dev/null || true
		echo "$bdf" > "/sys/bus/pci/drivers/$old_driver_name/unbind"
	fi

	pci_dev_echo "$bdf" "$old_driver_name -> no driver"
}
193 
# Print the mountpoint of every mounted hugetlbfs filesystem, one per line.
function linux_hugetlbfs_mounts() {
	mount | awk '/ type hugetlbfs / { print $3 }'
}
197 
# Print the names (e.g. nvme0n1) of all block devices whose resolved sysfs
# device path contains the given PCI address.
# Arguments: $1 - PCI address (bdf).
function get_block_dev_from_bdf() {
	local bdf=$1
	local entry devpath

	for entry in /sys/block/*; do
		devpath=$(readlink -f "$entry/device") || devpath=""
		if [[ $devpath == *"/$bdf/"* ]]; then
			echo "${entry##*/}"
		fi
	done
}
208 
# List the reasons block devices behind the given PCI address are in use:
#   holder@<dev>:<holder> - claimed by another device (e.g. dm/md stacking),
#   mount@<dev>:<part>    - a (partition of the) device has a mountpoint,
#   data@<dev>            - carries some signature per block_in_use() (common.sh).
# Arguments: $1 - PCI address (bdf).
# Returns:   1 when lsblk is unavailable, 0 otherwise; no output means "idle".
function get_used_bdf_block_devs() {
	local bdf=$1
	local blocks block blockp dev mount holder
	local used # grows via used+=() below, hence effectively an array

	hash lsblk &> /dev/null || return 1
	blocks=($(get_block_dev_from_bdf "$bdf"))

	for block in "${blocks[@]}"; do
		# Check if the device is hold by some other, regardless if it's mounted
		# or not.
		for holder in "/sys/class/block/$block"*/holders/*; do
			[[ -e $holder ]] || continue
			# blockp = the partition/device that is actually held.
			blockp=${holder%/holders*} blockp=${blockp##*/}
			if [[ -e $holder/slaves/$blockp ]]; then
				used+=("holder@$blockp:${holder##*/}")
			fi
		done
		# lsblk lists the device and all its partitions; any mountpoint counts.
		while read -r dev mount; do
			if [[ -e $mount ]]; then
				used+=("mount@$block:$dev")
			fi
		done < <(lsblk -l -n -o NAME,MOUNTPOINT "/dev/$block")
		if ((${#used[@]} == 0)); then
			# Make sure we check if there's any valid data present on the target device
			# regardless if it's being actively used or not. This is mainly done to make
			# sure we don't miss more complex setups like ZFS pools, etc.
			if block_in_use "$block" > /dev/null; then
				used+=("data@$block")
			fi
		fi
	done

	if ((${#used[@]} > 0)); then
		printf '%s\n' "${used[@]}"
	fi
}
246 
# Scan pci_ids.h for the supported device classes and populate the global
# per-type maps (nvme_d, ioat_d, dsa_d, iaa_d, virtio_d, vmd_d) plus
# all_devices_d and drivers_d. Map values: 0 - device is usable, 1 - skip it.
# Arguments: $1 - current mode; in "status" mode nothing is marked in-use.
function collect_devices() {
	# NVMe, IOAT, DSA, IAA, VIRTIO, VMD

	local ids dev_type dev_id bdf bdfs in_use driver

	ids+="PCI_DEVICE_ID_INTEL_IOAT"
	ids+="|PCI_DEVICE_ID_INTEL_DSA"
	ids+="|PCI_DEVICE_ID_INTEL_IAA"
	ids+="|PCI_DEVICE_ID_VIRTIO"
	ids+="|PCI_DEVICE_ID_INTEL_VMD"
	ids+="|SPDK_PCI_CLASS_NVME"

	local -gA nvme_d ioat_d dsa_d iaa_d virtio_d vmd_d all_devices_d drivers_d

	while read -r _ dev_type dev_id; do
		# Intel vendor id by default; NVMe is matched by class, virtio by 0x1af4.
		bdfs=(${pci_bus_cache["0x8086:$dev_id"]})
		[[ $dev_type == *NVME* ]] && bdfs=(${pci_bus_cache["$dev_id"]})
		[[ $dev_type == *VIRT* ]] && bdfs=(${pci_bus_cache["0x1af4:$dev_id"]})
		[[ $dev_type =~ (NVME|IOAT|DSA|IAA|VIRTIO|VMD) ]] && dev_type=${BASH_REMATCH[1],,}
		for bdf in "${bdfs[@]}"; do
			in_use=0
			if [[ $1 != status ]]; then
				if ! pci_can_use "$bdf"; then
					pci_dev_echo "$bdf" "Skipping denied controller at $bdf"
					in_use=1
				fi
				# Devices with active block devices on top must not be touched.
				if [[ $dev_type == nvme || $dev_type == virtio ]]; then
					if ! verify_bdf_block_devs "$bdf"; then
						in_use=1
					fi
				fi
				if [[ $dev_type == vmd ]]; then
					# VMD controllers are bound only when explicitly allowed.
					if [[ $PCI_ALLOWED != *"$bdf"* ]]; then
						pci_dev_echo "$bdf" "Skipping not allowed VMD controller at $bdf"
						in_use=1
					elif [[ " ${drivers_d[*]} " =~ "nvme" ]]; then
						if [[ "${DRIVER_OVERRIDE}" != "none" ]]; then
							if [ "$mode" == "config" ]; then
								cat <<- MESSAGE
									Binding new driver to VMD device. If there are NVMe SSDs behind the VMD endpoint
									which are attached to the kernel NVMe driver, the binding process may go faster
									if you first run this script with DRIVER_OVERRIDE="none" to unbind only the
									NVMe SSDs, and then run again to unbind the VMD devices.
								MESSAGE
							fi
						fi
					fi
				fi
			fi
			eval "${dev_type}_d[$bdf]=$in_use"
			all_devices_d["$bdf"]=$in_use
			if [[ -e /sys/bus/pci/devices/$bdf/driver ]]; then
				driver=$(readlink -f "/sys/bus/pci/devices/$bdf/driver")
				drivers_d["$bdf"]=${driver##*/}
			fi
		done
	done < <(grep -E "$ids" "$rootdir/include/spdk/pci_ids.h")
}
305 
# Resolve which kernel driver should handle the given device on "reset".
# Prefer modalias -> modprobe -R resolution; if that's unavailable, fall back
# to a hardcoded driver based on which device-type map the bdf landed in.
# Arguments: $1 - PCI address (bdf). Prints the driver name (may be empty).
function collect_driver() {
	local bdf=$1
	local drivers driver

	if [[ -e /sys/bus/pci/devices/$bdf/modalias ]] \
		&& drivers=($(modprobe -R "$(< "/sys/bus/pci/devices/$bdf/modalias")")); then
		# Pick first entry in case multiple aliases are bound to a driver.
		driver=$(readlink -f "/sys/module/${drivers[0]}/drivers/pci:"*)
		driver=${driver##*/}
	else
		[[ -n ${nvme_d["$bdf"]} ]] && driver=nvme
		[[ -n ${ioat_d["$bdf"]} ]] && driver=ioatdma
		[[ -n ${dsa_d["$bdf"]} ]] && driver=dsa
		[[ -n ${iaa_d["$bdf"]} ]] && driver=iaa
		[[ -n ${virtio_d["$bdf"]} ]] && driver=virtio-pci
		[[ -n ${vmd_d["$bdf"]} ]] && driver=vmd
	fi 2> /dev/null # silence modprobe/readlink noise for unknown devices
	echo "$driver"
}
325 
# Decide whether a device is safe to rebind based on its block-device usage.
# Arguments: $1 - PCI address (bdf).
# Returns:   0 when no active block devices were found, 1 otherwise (a message
#            naming the busy devices is printed) or when the check itself failed.
function verify_bdf_block_devs() {
	local bdf=$1
	local blknames
	blknames=($(get_used_bdf_block_devs "$bdf")) || return 1

	((${#blknames[@]} == 0)) && return 0

	local IFS=","
	pci_dev_echo "$bdf" "Active devices: ${blknames[*]}, so not binding PCI dev"
	return 1
}
337 
# Select a userspace-I/O driver (DRIVER_OVERRIDE > vfio-pci > uio_pci_generic
# > igb_uio fallback), load it and bind every usable collected device to it.
# Sets the global driver_name, which configure_linux() reads afterwards.
function configure_linux_pci() {
	local driver_path=""
	driver_name=""
	igb_uio_fallback=""

	if [[ -r "$rootdir/dpdk/build-tmp/kernel/linux/igb_uio/igb_uio.ko" ]]; then
		# igb_uio is a common driver to override with and it depends on uio.
		modprobe uio || true
		# Remember the .ko as a fallback if it's already loaded or loads now.
		if ! check_for_driver igb_uio || insmod "$rootdir/dpdk/build-tmp/kernel/linux/igb_uio/igb_uio.ko"; then
			igb_uio_fallback="$rootdir/dpdk/build-tmp/kernel/linux/igb_uio/igb_uio.ko"
		fi
	fi

	if [[ "${DRIVER_OVERRIDE}" == "none" ]]; then
		driver_name=none
	elif [[ -n "${DRIVER_OVERRIDE}" ]]; then
		driver_path="$DRIVER_OVERRIDE"
		driver_name="${DRIVER_OVERRIDE##*/}"
		# modprobe and the sysfs don't use the .ko suffix.
		driver_name=${driver_name%.ko}
		# path = name -> there is no path
		if [[ "$driver_path" = "$driver_name" ]]; then
			driver_path=""
		fi
	elif [[ -n "$(ls /sys/kernel/iommu_groups)" || (-e \
	/sys/module/vfio/parameters/enable_unsafe_noiommu_mode && \
	"$(cat /sys/module/vfio/parameters/enable_unsafe_noiommu_mode)" == "Y") ]]; then
		# An IOMMU (or vfio no-iommu mode) is available - prefer vfio-pci.
		driver_name=vfio-pci
		# Just in case, attempt to load VFIO_IOMMU_TYPE1 module into the kernel - this
		# should be done automatically by modprobe since this particular module should
		# be a part of vfio-pci dependencies, however, on some distros, it seems that
		# it's not the case. See #1689.
		if modinfo vfio_iommu_type1 > /dev/null; then
			modprobe vfio_iommu_type1
		fi
	elif ! check_for_driver uio_pci_generic || modinfo uio_pci_generic > /dev/null 2>&1; then
		driver_name=uio_pci_generic
	elif [[ -e $igb_uio_fallback ]]; then
		driver_path="$igb_uio_fallback"
		driver_name="igb_uio"
		echo "WARNING: uio_pci_generic not detected - using $driver_name"
	else
		echo "No valid drivers found [vfio-pci, uio_pci_generic, igb_uio]. Please enable one of the kernel modules."
		return 1
	fi

	# modprobe assumes the directory of the module. If the user passes in a path, we should use insmod
	if [[ $driver_name != "none" ]]; then
		if [[ -n "$driver_path" ]]; then
			insmod $driver_path || true
		else
			modprobe $driver_name
		fi
	fi

	for bdf in "${!all_devices_d[@]}"; do
		if ((all_devices_d["$bdf"] == 0)); then
			if [[ -n ${nvme_d["$bdf"]} ]]; then
				# Some nvme controllers may take significant amount of time while being
				# unbound from the driver. Put that task into background to speed up the
				# whole process. Currently this is done only for the devices bound to the
				# nvme driver as other, i.e., ioatdma's, trigger a kernel BUG when being
				# unbound in parallel. See https://bugzilla.kernel.org/show_bug.cgi?id=209041.
				linux_bind_driver "$bdf" "$driver_name" &
			else
				linux_bind_driver "$bdf" "$driver_name"
			fi
		fi
	done
	wait

	# Re-enumerate the bus so devices left without a driver reappear properly.
	echo "1" > "/sys/bus/pci/rescan"
}
411 
412 function cleanup_linux() {
413 	local dirs_to_clean=() files_to_clean=() opened_files=() file_locks=()
414 	local match_spdk="spdk_tgt|iscsi|vhost|nvmf|rocksdb|bdevio|bdevperf|vhost_fuzz|nvme_fuzz|accel_perf|bdev_svc"
415 
416 	dirs_to_clean=({/var/run,/tmp}/dpdk/spdk{,_pid}+([0-9]))
417 	if [[ -d $XDG_RUNTIME_DIR ]]; then
418 		dirs_to_clean+=("$XDG_RUNTIME_DIR/dpdk/spdk"{,_pid}+([0-9]))
419 	fi
420 
421 	for dir in "${dirs_to_clean[@]}"; do
422 		files_to_clean+=("$dir/"*)
423 	done
424 	file_locks+=(/var/tmp/spdk_pci_lock*)
425 	file_locks+=(/var/tmp/spdk_cpu_lock*)
426 
427 	files_to_clean+=(/dev/shm/@(@($match_spdk)_trace|spdk_iscsi_conns)*)
428 	files_to_clean+=("${file_locks[@]}")
429 
430 	# This may fail in case path that readlink attempts to resolve suddenly
431 	# disappears (as it may happen with terminating processes).
432 	opened_files+=($(readlink -f /proc/+([0-9])/fd/+([0-9]))) || true
433 
434 	if ((${#opened_files[@]} == 0)); then
435 		echo "Can't get list of opened files!"
436 		exit 1
437 	fi
438 
439 	echo 'Cleaning'
440 	for f in "${files_to_clean[@]}"; do
441 		[[ -e $f ]] || continue
442 		if [[ ${opened_files[*]} != *"$f"* ]]; then
443 			echo "Removing:    $f"
444 			rm $f
445 		else
446 			echo "Still open: $f"
447 		fi
448 	done
449 
450 	for dir in "${dirs_to_clean[@]}"; do
451 		[[ -d $dir ]] || continue
452 		if [[ ${opened_files[*]} != *"$dir"* ]]; then
453 			echo "Removing:    $dir"
454 			rmdir $dir
455 		else
456 			echo "Still open: $dir"
457 		fi
458 	done
459 	echo "Clean"
460 }
461 
# Request NRHUGE hugepages via the given nr_hugepages interface and verify the
# kernel actually granted them.
# Arguments: $1 - path to a nr_hugepages file (procfs or per-node sysfs),
#            $2 - NUMA node id, optional, used only in messages.
# Globals:   NRHUGE (requested page count), SHRINK_HUGE (allow lowering count).
# Returns:   0 on success or when enough pages were already allocated,
#            1 when fewer pages than requested could be allocated.
check_hugepages_alloc() {
	local hp_interface=$1
	local node=$2
	local current

	current=$(< "$hp_interface")

	# Don't shrink an already-sufficient allocation unless explicitly allowed.
	if [[ $SHRINK_HUGE != yes ]] && ((NRHUGE <= current)); then
		echo "INFO: Requested $NRHUGE hugepages but $current already allocated ${node:+on node$node}"
		return 0
	fi

	# Clamp negative requests to zero before writing.
	echo $((NRHUGE < 0 ? 0 : NRHUGE)) > "$hp_interface"

	current=$(< "$hp_interface")
	if ((current < NRHUGE)); then
		cat <<- ERROR

			## ERROR: requested $NRHUGE hugepages but $current could be allocated ${node:+on node$node}.
			## Memory might be heavily fragmented. Please try flushing the system cache, or reboot the machine.
		ERROR
		return 1
	fi
}
485 
486 clear_hugepages() { echo 0 > /proc/sys/vm/nr_hugepages; }
487 
# Allocate hugepages according to the user's env knobs.
# Globals read: CLEAR_HUGE, HUGE_EVEN_ALLOC, HUGENODE, HUGEPGSZ, NRHUGE,
#               SHRINK_HUGE (via check_hugepages_alloc).
configure_linux_hugepages() {
	local node
	local nodes_to_use nodes_hp
	# Map of node id -> per-node nr_hugepages interface. Was leaking into the
	# global scope before; nothing outside this function uses it.
	local nodes

	if [[ $CLEAR_HUGE == yes ]]; then
		clear_hugepages
	fi

	if [[ $HUGE_EVEN_ALLOC == yes ]]; then
		# Let the kernel spread the allocation across all NUMA nodes.
		clear_hugepages
		check_hugepages_alloc /proc/sys/vm/nr_hugepages
		return 0
	fi

	for node in /sys/devices/system/node/node*; do
		[[ -e $node ]] || continue
		nodes[${node##*node}]=$node/hugepages/hugepages-${HUGEPGSZ}kB/nr_hugepages
	done

	if ((${#nodes[@]} == 0)); then
		# No NUMA support? Fallback to common interface
		check_hugepages_alloc /proc/sys/vm/nr_hugepages
		return 0
	fi

	IFS="," read -ra nodes_to_use <<< "$HUGENODE"
	if ((${#nodes_to_use[@]} == 0)); then
		nodes_to_use[0]=0
	fi

	# Align indexes with node ids. HUGENODE entries are either plain node ids
	# or explicit per-node assignments like "nodes_hp[1]=512" (eval'd below).
	for node in "${!nodes_to_use[@]}"; do
		if [[ ${nodes_to_use[node]} =~ ^nodes_hp\[[0-9]+\]= ]]; then
			eval "${nodes_to_use[node]}"
		elif [[ ${nodes_to_use[node]} =~ ^[0-9]+$ ]]; then
			nodes_hp[nodes_to_use[node]]=$NRHUGE
		fi
	done

	for node in "${!nodes_hp[@]}"; do
		if [[ -z ${nodes[node]} ]]; then
			echo "Node $node doesn't exist, ignoring" >&2
			continue
		fi
		# Per-node count overrides the global NRHUGE for this call only.
		NRHUGE=${nodes_hp[node]:-$NRHUGE} check_hugepages_alloc "${nodes[node]}" "$node"
	done
}
535 
# Full "config" mode on Linux: bind devices, ensure a hugetlbfs mount exists,
# allocate hugepages and fix up permissions/limits for TARGET_USER.
function configure_linux() {
	configure_linux_pci
	hugetlbfs_mounts=$(linux_hugetlbfs_mounts)

	if [ -z "$hugetlbfs_mounts" ]; then
		hugetlbfs_mounts=/mnt/huge
		echo "Mounting hugetlbfs at $hugetlbfs_mounts"
		mkdir -p "$hugetlbfs_mounts"
		mount -t hugetlbfs nodev "$hugetlbfs_mounts"
	fi

	configure_linux_hugepages

	# vfio needs the hugepage mounts writable by TARGET_USER and enough
	# memlock; warn when the limit looks too low for DPDK+VFIO.
	# ($driver_name is set globally by configure_linux_pci above.)
	if [ "$driver_name" = "vfio-pci" ]; then
		if [ -n "$TARGET_USER" ]; then
			for mount in $hugetlbfs_mounts; do
				chown "$TARGET_USER" "$mount"
				chmod g+w "$mount"
			done

			MEMLOCK_AMNT=$(su "$TARGET_USER" -c "ulimit -l")
			if [[ $MEMLOCK_AMNT != "unlimited" ]]; then
				MEMLOCK_MB=$((MEMLOCK_AMNT / 1024))
				cat <<- MEMLOCK
					"$TARGET_USER" user memlock limit: $MEMLOCK_MB MB

					This is the maximum amount of memory you will be
					able to use with DPDK and VFIO if run as user "$TARGET_USER".
					To change this, please adjust limits.conf memlock limit for user "$TARGET_USER".
				MEMLOCK
				if ((MEMLOCK_AMNT < 65536)); then
					echo ""
					echo "## WARNING: memlock limit is less than 64MB"
					echo -n "## DPDK with VFIO may not be able to initialize "
					echo "if run as user \"$TARGET_USER\"."
				fi
			fi
		fi
	fi

	if [ $(uname -i) == "x86_64" ] && [ ! -e /dev/cpu/0/msr ]; then
		# Some distros build msr as a module.  Make sure it's loaded to ensure
		#  DPDK can easily figure out the TSC rate rather than relying on 100ms
		#  sleeps.
		modprobe msr &> /dev/null || true
	fi
}
583 
# "reset" mode: rebind every collected, usable device back to its native
# kernel driver (as resolved by collect_driver()), or unbind it entirely when
# no suitable driver is available.
function reset_linux_pci() {
	# virtio
	# TODO: check if virtio-pci is loaded first and just unbind if it is not loaded
	# Requires some more investigation - for example, some kernels do not seem to have
	#  virtio-pci but just virtio_scsi instead.  Also need to make sure we get the
	#  underscore vs. dash right in the virtio_scsi name.
	modprobe virtio-pci || true
	for bdf in "${!all_devices_d[@]}"; do
		((all_devices_d["$bdf"] == 0)) || continue

		# check_for_driver returns non-zero when the driver IS present.
		driver=$(collect_driver "$bdf")
		if [[ -n $driver ]] && ! check_for_driver "$driver"; then
			linux_bind_driver "$bdf" "$driver"
		else
			linux_unbind_driver "$bdf"
		fi
	done

	# Re-enumerate so devices left without a driver reappear properly.
	echo "1" > "/sys/bus/pci/rescan"
}
604 
# Rebind devices to their original drivers and remove stale SPDK hugepage
# mapping files and runtime files.
function reset_linux() {
	reset_linux_pci
	for mount in $(linux_hugetlbfs_mounts); do
		for hp in "$mount"/spdk*map_*; do
			# Only remove mappings that no live process holds a lock on.
			flock -n "$hp" true && rm -f "$hp"
		done
	done
	rm -f /run/.spdk*
}
614 
# "status" mode: print hugepage usage (per NUMA node when available) and a
# table of all collected SPDK-compatible devices with their drivers and block
# devices. Headers go to stderr so stdout stays machine-parsable.
function status_linux() {
	echo "Hugepages" >&2
	printf "%-6s %10s %8s / %6s\n" "node" "hugesize" "free" "total" >&2

	numa_nodes=0
	for path in /sys/devices/system/node/node*/hugepages/hugepages-*/; do
		numa_nodes=$((numa_nodes + 1))
		free_pages=$(cat $path/free_hugepages)
		all_pages=$(cat $path/nr_hugepages)

		# Extract node id and hugepage size from the sysfs path.
		[[ $path =~ (node[0-9]+)/hugepages/hugepages-([0-9]+kB) ]]

		node=${BASH_REMATCH[1]}
		huge_size=${BASH_REMATCH[2]}

		printf "%-6s %10s %8s / %6s\n" $node $huge_size $free_pages $all_pages
	done

	# fall back to system-wide hugepages
	if [ "$numa_nodes" = "0" ]; then
		free_pages=$(grep HugePages_Free /proc/meminfo | awk '{ print $2 }')
		all_pages=$(grep HugePages_Total /proc/meminfo | awk '{ print $2 }')
		node="-"
		huge_size="$HUGEPGSZ"

		printf "%-6s %10s %8s / %6s\n" $node $huge_size $free_pages $all_pages
	fi

	printf '\n%-8s %-15s %-6s %-6s %-7s %-16s %-10s %s\n' \
		"Type" "BDF" "Vendor" "Device" "NUMA" "Driver" "Device" "Block devices" >&2

	sorted_bdfs=($(printf '%s\n' "${!all_devices_d[@]}" | sort))

	for bdf in "${sorted_bdfs[@]}"; do
		driver=${drivers_d["$bdf"]}
		if [ "$numa_nodes" = "0" ]; then
			node="-"
		else
			node=$(cat /sys/bus/pci/devices/$bdf/numa_node)
			if ((node == -1)); then
				node=unknown
			fi
		fi
		# NVMe char device name (e.g. nvme0), when bound to the kernel driver.
		if [ "$driver" = "nvme" ] && [ -d /sys/bus/pci/devices/$bdf/nvme ]; then
			name=$(ls /sys/bus/pci/devices/$bdf/nvme)
		else
			name="-"
		fi

		# Only nvme and virtio devices can expose block devices.
		if [[ -n ${nvme_d["$bdf"]} || -n ${virtio_d["$bdf"]} ]]; then
			blknames=($(get_block_dev_from_bdf "$bdf"))
		else
			blknames=("-")
		fi

		# First per-type map that contains the bdf determines the printed type.
		desc=""
		desc=${desc:-${nvme_d["$bdf"]:+NVMe}}
		desc=${desc:-${ioat_d["$bdf"]:+I/OAT}}
		desc=${desc:-${dsa_d["$bdf"]:+DSA}}
		desc=${desc:-${iaa_d["$bdf"]:+IAA}}
		desc=${desc:-${virtio_d["$bdf"]:+virtio}}
		desc=${desc:-${vmd_d["$bdf"]:+VMD}}

		printf '%-8s %-15s %-6s %-6s %-7s %-16s %-10s %s\n' \
			"$desc" "$bdf" "${pci_ids_vendor["$bdf"]#0x}" "${pci_ids_device["$bdf"]#0x}" \
			"$node" "${driver:--}" "${name:-}" "${blknames[*]:--}"
	done
}
683 
# "status" mode for FreeBSD: print contigmem state and a table of all
# collected devices with their drivers.
function status_freebsd() {
	local pci

	# Helper (runs in a subshell); prints one row per bdf, sorted by address.
	status_print() (
		local type=$1
		local dev driver

		shift

		for pci; do
			printf '%-8s %-15s %-6s %-6s %-16s\n' \
				"$type" \
				"$pci" \
				"${pci_ids_vendor["$pci"]}" \
				"${pci_ids_device["$pci"]}" \
				"${pci_bus_driver["$pci"]}"
		done | sort -k2,2
	)

	local contigmem=present
	local contigmem_buffer_size
	local contigmem_num_buffers

	if ! kldstat -q -m contigmem; then
		contigmem="not present"
	fi
	if ! contigmem_buffer_size=$(kenv hw.contigmem.buffer_size 2> /dev/null); then
		contigmem_buffer_size="not set"
	fi
	if ! contigmem_num_buffers=$(kenv hw.contigmem.num_buffers 2> /dev/null); then
		contigmem_num_buffers="not set"
	fi

	cat <<- BSD_INFO
		Contigmem ($contigmem)
		Buffer Size: $contigmem_buffer_size
		Num Buffers: $contigmem_num_buffers

	BSD_INFO

	printf '\n%-8s %-15s %-6s %-6s %-16s\n' \
		"Type" "BDF" "Vendor" "Device" "Driver" >&2

	status_print "NVMe" "${!nvme_d[@]}"
	status_print "I/OAT" "${!ioat_d[@]}"
	status_print "DSA" "${!dsa_d[@]}"
	status_print "IAA" "${!iaa_d[@]}"
	status_print "VMD" "${!vmd_d[@]}"
}
733 
# Bind all collected devices to nic_uio by reloading the module with the
# hw.nic_uio.bdfs kernel environment variable listing the target addresses.
function configure_freebsd_pci() {
	local BDFS

	BDFS+=("${!nvme_d[@]}")
	BDFS+=("${!ioat_d[@]}")
	BDFS+=("${!dsa_d[@]}")
	BDFS+=("${!iaa_d[@]}")
	BDFS+=("${!vmd_d[@]}")

	# Drop the domain part from all the addresses
	BDFS=("${BDFS[@]#*:}")

	# nic_uio expects a comma-separated bdf list.
	local IFS=","
	kldunload nic_uio.ko || true
	kenv hw.nic_uio.bdfs="${BDFS[*]}"
	kldload nic_uio.ko
}
751 
# Full "config" mode for FreeBSD: bind devices to nic_uio and (re)load
# contigmem sized per HUGEMEM (fixed 256MB buffers).
function configure_freebsd() {
	if ! check_for_driver_freebsd; then
		echo "DPDK drivers (contigmem and/or nic_uio) are missing, aborting" >&2
		return 1
	fi
	configure_freebsd_pci
	# If contigmem is already loaded but the HUGEMEM specified doesn't match the
	#  previous value, unload contigmem so that we can reload with the new value.
	if kldstat -q -m contigmem; then
		# contigmem may be loaded, but the kernel environment doesn't have to
		# be necessarily set at this point. If it isn't, kenv will fail to
		# pick up the hw. options. Handle it.
		if ! contigmem_num_buffers=$(kenv hw.contigmem.num_buffers); then
			contigmem_num_buffers=-1
		fi 2> /dev/null
		if ((contigmem_num_buffers != HUGEMEM / 256)); then
			kldunload contigmem.ko
		fi
	fi
	if ! kldstat -q -m contigmem; then
		kenv hw.contigmem.num_buffers=$((HUGEMEM / 256))
		kenv hw.contigmem.buffer_size=$((256 * 1024 * 1024))
		kldload contigmem.ko
	fi
}
777 
# Unload the DPDK kernel modules; failures are ignored since the modules may
# simply not be loaded.
function reset_freebsd() {
	local module
	for module in contigmem.ko nic_uio.ko; do
		kldunload "$module" || true
	done
}
782 
# Populate the PCI bus cache (helper from common.sh).
CMD=reset cache_pci_bus

mode=$1

if [ -z "$mode" ]; then
	mode="config"
fi

# Defaults for the user-tunable environment variables.
: ${HUGEMEM:=2048}
: ${PCI_ALLOWED:=""}
: ${PCI_BLOCKED:=""}

# Legacy knob kept for backward compatibility - folds into PCI_ALLOWED.
if [ -n "$NVME_ALLOWED" ]; then
	PCI_ALLOWED="$PCI_ALLOWED $NVME_ALLOWED"
fi

if [ -n "$SKIP_PCI" ]; then
	PCI_ALLOWED="none"
fi

# Resolve the non-root user that should own hugepage mounts / vfio groups.
if [ -z "$TARGET_USER" ]; then
	TARGET_USER="$SUDO_USER"
	if [ -z "$TARGET_USER" ]; then
		TARGET_USER=$(logname 2> /dev/null) || true
	fi
fi

collect_devices "$mode"

# Optionally spawn a helper that waits for block devices to (re)appear after
# the reset rebinds devices to their kernel drivers.
if [[ $mode == reset && $PCI_BLOCK_SYNC_ON_RESET == yes ]]; then
	# Note that this will wait only for the first block device attached to
	# a given storage controller. For nvme this may miss some of the devs
	# in case multiple namespaces are being in place.
	# FIXME: Wait for nvme controller(s) to be in live state and determine
	# number of configured namespaces, build list of potential block devs
	# and pass them to sync_dev_uevents. Is it worth the effort?
	bdfs_to_wait_for=()
	for bdf in "${!all_devices_d[@]}"; do
		((all_devices_d["$bdf"] == 0)) || continue
		if [[ -n ${nvme_d["$bdf"]} || -n ${virtio_d["$bdf"]} ]]; then
			# Only wait for devices whose driver will actually change.
			[[ $(collect_driver "$bdf") != "${drivers_d["$bdf"]}" ]] || continue
			bdfs_to_wait_for+=("$bdf")
		fi
	done
	if ((${#bdfs_to_wait_for[@]} > 0)); then
		echo "Waiting for block devices as requested"
		export UEVENT_TIMEOUT=5 DEVPATH_LOOKUP=yes DEVPATH_SUBSYSTEM=pci
		"$rootdir/scripts/sync_dev_uevents.sh" \
			block/disk \
			"${bdfs_to_wait_for[@]}" &
		sync_pid=$!
	fi
fi

if [[ $os == Linux ]]; then
	# Resolve hugepage size (kB) and derive NRHUGE from HUGEMEM (MB) before
	# dispatching to any mode handler.
	if [[ -n $HUGEPGSZ && ! -e /sys/kernel/mm/hugepages/hugepages-${HUGEPGSZ}kB ]]; then
		echo "${HUGEPGSZ}kB is not supported by the running kernel, ignoring" >&2
		unset -v HUGEPGSZ
	fi

	HUGEPGSZ=${HUGEPGSZ:-$(grep Hugepagesize /proc/meminfo | cut -d : -f 2 | tr -dc '0-9')}
	HUGEPGSZ_MB=$((HUGEPGSZ / 1024))
	# Round the page count up so at least HUGEMEM MB are covered.
	: ${NRHUGE=$(((HUGEMEM + HUGEPGSZ_MB - 1) / HUGEPGSZ_MB))}

	if [ "$mode" == "config" ]; then
		configure_linux
	elif [ "$mode" == "cleanup" ]; then
		cleanup_linux
		clear_hugepages
	elif [ "$mode" == "reset" ]; then
		reset_linux
	elif [ "$mode" == "status" ]; then
		status_linux
	elif [ "$mode" == "help" ]; then
		usage $0
	else
		usage $0 "Invalid argument '$mode'"
	fi
else
	if [ "$mode" == "config" ]; then
		configure_freebsd
	elif [ "$mode" == "reset" ]; then
		reset_freebsd
	elif [ "$mode" == "cleanup" ]; then
		echo "setup.sh cleanup function not yet supported on $os"
	elif [ "$mode" == "status" ]; then
		status_freebsd
	elif [ "$mode" == "help" ]; then
		usage $0
	else
		usage $0 "Invalid argument '$mode'"
	fi
fi

# Reap the uevent-sync helper, if one was started above.
if [[ -e /proc/$sync_pid/status ]]; then
	wait "$sync_pid"
fi
880