#!/usr/bin/env bash
#  SPDX-License-Identifier: BSD-3-Clause
#  Copyright (C) 2016 Intel Corporation
#  All rights reserved.
#
set -e
shopt -s nullglob extglob

os=$(uname -s)

if [[ $os != Linux && $os != FreeBSD ]]; then
	echo "Unsupported platform ($os), aborting"
	exit 1
fi

rootdir=$(readlink -f "$(dirname "$0")")/..
source "$rootdir/scripts/common.sh"

function usage() {
	if [[ $os == Linux ]]; then
		options="[config|reset|status|cleanup|help]"
	else
		options="[config|reset|help]"
	fi

	[[ -n $2 ]] && (
		echo "$2"
		echo ""
	)
	echo "Helper script for allocating hugepages and binding NVMe, I/OAT, VMD and Virtio devices"
	echo "to a generic VFIO kernel driver. If VFIO is not available on the system, this script"
	echo "will fall back to UIO. NVMe and Virtio devices with active mountpoints will be ignored."
	echo "All hugepage operations use the default hugepage size on the system (hugepagesz)."
	echo "Usage: $(basename $1) $options"
	echo
	echo "$options - as follows:"
	echo "config            Default mode. Allocate hugepages and bind PCI devices."
	if [[ $os == Linux ]]; then
		echo "cleanup           Remove any orphaned files that may be left in the system after an SPDK application exits"
	fi
	echo "reset             Rebind PCI devices back to their original drivers."
	echo "                  Also clean up any leftover SPDK files/resources."
	echo "                  Hugepage memory size will remain unchanged."
	if [[ $os == Linux ]]; then
		echo "status            Print status of all SPDK-compatible devices on the system."
	fi
	echo "help              Print this help message."
	echo
	echo "The following environment variables can be specified."
	echo "HUGEMEM           Size of hugepage memory to allocate (in MB). 2048 by default."
	echo "                  For NUMA systems, the hugepages will be distributed on node0 by"
	echo "                  default."
	echo "HUGE_EVEN_ALLOC   If set to 'yes', hugepages will be evenly distributed across all"
	echo "                  of the system's NUMA nodes (effectively ignoring anything set in HUGENODE)."
	echo "                  Uses the kernel's default hugepage size."
	echo "NRHUGE            Number of hugepages to allocate. This variable overrides HUGEMEM."
	echo "HUGENODE          Specific NUMA node to allocate hugepages on. Multiple nodes can be"
	echo "                  separated with commas. By default, NRHUGE will be applied on each node."
	echo "                  Hugepages can be defined per node with e.g.:"
	echo "                  HUGENODE='nodes_hp[0]=2048,nodes_hp[1]=512,2' - this will allocate"
	echo "                  2048 pages for node0, 512 for node1 and default NRHUGE for node2."
	echo "HUGEPGSZ          Size of the hugepages to use in kB. If not set, kernel's default"
	echo "                  setting is used."
	echo "SHRINK_HUGE       If set to 'yes', hugepage allocation won't be skipped when the"
	echo "                  number of requested hugepages is lower than what's already"
	echo "                  allocated. Doesn't apply when HUGE_EVEN_ALLOC is in use."
	echo "CLEAR_HUGE        If set to 'yes', an attempt to remove hugepages from all nodes will"
	echo "                  be made prior to allocation."
	echo "PCI_ALLOWED"
	echo "PCI_BLOCKED       Whitespace separated list of PCI devices (NVMe, I/OAT, VMD, Virtio)."
	echo "                  Each device must be specified as a full PCI address."
	echo "                  E.g. PCI_ALLOWED=\"0000:01:00.0 0000:02:00.0\""
	echo "                  To block all PCI devices: PCI_ALLOWED=\"none\""
	echo "                  To allow all PCI devices except 0000:01:00.0: PCI_BLOCKED=\"0000:01:00.0\""
	echo "                  To allow only PCI device 0000:01:00.0: PCI_ALLOWED=\"0000:01:00.0\""
	echo "                  If PCI_ALLOWED and PCI_BLOCKED are empty or unset, all PCI devices"
	echo "                  will be bound."
	echo "                  Each device in PCI_BLOCKED will be ignored (driver won't be changed)."
	echo "                  PCI_BLOCKED has precedence over PCI_ALLOWED."
	echo "TARGET_USER       User that will own the hugepage mountpoint directory and vfio groups."
	echo "                  By default the current user will be used."
	echo "DRIVER_OVERRIDE   Disable automatic vfio-pci/uio_pci_generic selection and forcefully"
	echo "                  bind devices to the given driver."
	echo "                  E.g. DRIVER_OVERRIDE=uio_pci_generic or DRIVER_OVERRIDE=/home/public/dpdk/build/kmod/igb_uio.ko"
	echo "PCI_BLOCK_SYNC_ON_RESET"
	echo "                  If set in the environment, an attempt to wait for block devices associated"
	echo "                  with a given PCI device will be made upon reset."
	echo "UNBIND_ENTIRE_IOMMU_GROUP"
	echo "                  If set, all devices from the NVMe device's IOMMU group will be unbound from their drivers."
	echo "                  Use with caution."
	exit 0
}

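# Example invocations (illustrative, not exhaustive; run as root from the SPDK
# repo root):
#   ./scripts/setup.sh                            # config mode, default 2048 MB of hugepages
#   HUGEMEM=4096 ./scripts/setup.sh config        # 4 GB of hugepages
#   PCI_BLOCKED="0000:01:00.0" ./scripts/setup.sh # skip one (hypothetical) controller
#   ./scripts/setup.sh reset                      # rebind devices to their original drivers
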
# In monolithic kernels lsmod won't work, so fall back to /sys/module. We also
# check /sys/bus/pci/drivers/ as neither lsmod nor /sys/module might contain
# the needed info (like in Fedora-like OSes).
function check_for_driver() {
	if [[ -z $1 ]]; then
		return 0
	fi

	if lsmod | grep -q "${1//-/_}"; then
		return 1
	fi

	if [[ -d /sys/module/${1} || -d \
		/sys/module/${1//-/_} || -d \
		/sys/bus/pci/drivers/${1} || -d \
		/sys/bus/pci/drivers/${1//-/_} ]]; then
		return 2
	fi
	return 0
}

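# Note the inverted semantics: check_for_driver returns non-zero when the driver
# is present. Illustrative check:
#   if ! check_for_driver vfio-pci; then echo "vfio-pci is available"; fi
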
function check_for_driver_freebsd() {
	# Check if dpdk drivers (nic_uio, contigmem) are in the kernel's module path.
	local search_paths path driver
	IFS=";" read -ra search_paths < <(kldconfig -rU)

	for driver in contigmem.ko nic_uio.ko; do
		for path in "${search_paths[@]}"; do
			[[ -f $path/$driver ]] && continue 2
		done
		return 1
	done
	return 0
}

function pci_dev_echo() {
	local bdf="$1"
	shift
	echo "$bdf (${pci_ids_vendor["$bdf"]#0x} ${pci_ids_device["$bdf"]#0x}): $*"
}

function probe_driver() {
	local bdf=$1
	local driver_name=$2
	local old_driver_name=${drivers_d["$bdf"]:-no driver}

	if [[ $driver_name == "$old_driver_name" ]]; then
		pci_dev_echo "$bdf" "Already using the $old_driver_name driver"
		return 0
	fi

	if [[ $old_driver_name != "no driver" ]]; then
		echo "$bdf" > "/sys/bus/pci/devices/$bdf/driver/unbind"
	fi

	pci_dev_echo "$bdf" "$old_driver_name -> $driver_name"

	if [[ $driver_name == "none" ]]; then
		return 0
	fi

	local probe_attempts=0
	echo "$driver_name" > "/sys/bus/pci/devices/$bdf/driver_override"
	while ! echo "$bdf" > "/sys/bus/pci/drivers_probe" && ((probe_attempts++ < 10)); do
		pci_dev_echo "$bdf" "failed to bind to $driver_name, retrying ($probe_attempts)"
		sleep 0.5
	done 2> /dev/null

	echo "" > "/sys/bus/pci/devices/$bdf/driver_override"

	if [[ ! -e /sys/bus/pci/drivers/$driver_name/$bdf ]]; then
		pci_dev_echo "$bdf" "failed to bind to $driver_name, aborting"
		return 1
	fi
}

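# Illustrative example (hypothetical BDF): rebind a device to vfio-pci, then
# release it so it's left with no driver at all:
#   probe_driver 0000:01:00.0 vfio-pci
#   probe_driver 0000:01:00.0 none
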
function linux_bind_driver() {
	local bdf="$1"
	local driver_name="$2"

	probe_driver "$bdf" "$driver_name"

	iommu_group=$(basename $(readlink -f /sys/bus/pci/devices/$bdf/iommu_group))
	if [ -e "/dev/vfio/$iommu_group" ]; then
		if [ -n "$TARGET_USER" ]; then
			chown "$TARGET_USER" "/dev/vfio/$iommu_group"
		fi
	fi

	local iommug=("/sys/bus/pci/devices/$bdf/iommu_group/devices/"!($bdf))
	local _bdf _driver
	if ((${#iommug[@]} > 0)) && [[ $driver_name == vfio* ]]; then
		pci_dev_echo "$bdf" "WARNING: detected multiple devices (${#iommug[@]}) under the same IOMMU group!"
		for _bdf in "${iommug[@]}"; do
			_driver=$(readlink -f "$_bdf/driver")
			if [[ ! -e $_driver || ${_driver##*/} == "$driver_name" ]]; then
				continue
			fi
			# See what DPDK considers to be a "viable" iommu group: dpdk/lib/eal/linux/eal_vfio.c -> rte_vfio_setup_device()
			pci_dev_echo "$bdf" "WARNING: ${_bdf##*/} not bound to $driver_name (${_driver##*/})"
			pci_dev_echo "$bdf" "WARNING: All devices in the IOMMU group must be bound to the same driver or unbound"
			if [[ $UNBIND_ENTIRE_IOMMU_GROUP == yes ]]; then
				pci_dev_echo "$bdf" "WARNING: Attempting to unbind ${_bdf##*/}"
				drivers_d["${_bdf##*/}"]=${_driver##*/}
				probe_driver "${_bdf##*/}" none
			fi
		done
	fi
}

function linux_unbind_driver() {
	local bdf="$1"
	local old_driver_name=${drivers_d["$bdf"]:-no driver}

	if [[ $old_driver_name == "no driver" ]]; then
		pci_dev_echo "$bdf" "Not bound to any driver"
		return 0
	fi

	if [[ -e /sys/bus/pci/drivers/$old_driver_name ]]; then
		echo "$bdf" > "/sys/bus/pci/drivers/$old_driver_name/unbind"
		echo "" > "/sys/bus/pci/devices/$bdf/driver_override"
	fi

	pci_dev_echo "$bdf" "$old_driver_name -> no driver"
}

function linux_hugetlbfs_mounts() {
	mount | grep ' type hugetlbfs ' | awk '{ print $3 }'
}

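# e.g. on systemd-based distros this typically prints "/dev/hugepages".
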
function get_block_dev_from_bdf() {
	local bdf=$1
	local block blocks=() ctrl

	for block in /sys/block/*; do
		if [[ $block == *nvme* ]]; then
			ctrl=${block##*/} ctrl=${ctrl%n*}
			if [[ -e /sys/class/nvme/$ctrl && $(< "/sys/class/nvme/$ctrl/address") == "$bdf" ]]; then
				blocks+=("${block##*/}")
			fi
		elif [[ $(readlink -f "$block/device") == *"/$bdf/"* ]]; then
			blocks+=("${block##*/}")
		fi
	done
	printf '%s\n' "${blocks[@]}"
}

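# Illustrative: for an NVMe controller at a hypothetical BDF this prints its
# block devices, one per line:
#   get_block_dev_from_bdf 0000:01:00.0   # -> nvme0n1
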
function get_used_bdf_block_devs() {
	local bdf=$1
	local blocks block blockp dev mount holder
	local used=()

	hash lsblk &> /dev/null || return 1
	blocks=($(get_block_dev_from_bdf "$bdf"))

	for block in "${blocks[@]}"; do
		# Check if the device is held by some other device, regardless of
		# whether it's mounted or not.
		for holder in "/sys/class/block/$block"*/holders/*; do
			[[ -e $holder ]] || continue
			blockp=${holder%/holders*} blockp=${blockp##*/}
			if [[ -e $holder/slaves/$blockp ]]; then
				used+=("holder@$blockp:${holder##*/}")
			fi
		done
		while read -r dev mount; do
			if [[ -e $mount ]]; then
				used+=("mount@$block:$dev")
			fi
		done < <(lsblk -l -n -o NAME,MOUNTPOINT "/dev/$block")
		if ((${#used[@]} == 0)); then
			# Make sure we check if there's any valid data present on the target device
			# regardless of whether it's being actively used or not. This is mainly done
			# to make sure we don't miss more complex setups like ZFS pools, etc.
			if block_in_use "$block" > /dev/null; then
				used+=("data@$block")
			fi
		fi
	done

	if ((${#used[@]} > 0)); then
		printf '%s\n' "${used[@]}"
	fi
}

function collect_devices() {
	# NVMe, IOAT, DSA, IAA, VIRTIO, VMD

	local ids dev_type dev_id bdf bdfs in_use driver

	ids+="PCI_DEVICE_ID_INTEL_IOAT"
	ids+="|PCI_DEVICE_ID_INTEL_DSA"
	ids+="|PCI_DEVICE_ID_INTEL_IAA"
	ids+="|PCI_DEVICE_ID_VIRTIO"
	ids+="|PCI_DEVICE_ID_INTEL_VMD"
	ids+="|SPDK_PCI_CLASS_NVME"

	local -gA nvme_d ioat_d dsa_d iaa_d virtio_d vmd_d all_devices_d drivers_d

	while read -r _ dev_type dev_id; do
		bdfs=(${pci_bus_cache["0x8086:$dev_id"]})
		[[ $dev_type == *NVME* ]] && bdfs=(${pci_bus_cache["$dev_id"]})
		[[ $dev_type == *VIRT* ]] && bdfs=(${pci_bus_cache["0x1af4:$dev_id"]})
		[[ $dev_type =~ (NVME|IOAT|DSA|IAA|VIRTIO|VMD) ]] && dev_type=${BASH_REMATCH[1],,}
		for bdf in "${bdfs[@]}"; do
			in_use=0
			if [[ $1 != status ]]; then
				if ! pci_can_use "$bdf"; then
					pci_dev_echo "$bdf" "Skipping denied controller at $bdf"
					in_use=1
				fi
				if [[ $dev_type == nvme || $dev_type == virtio ]]; then
					if ! verify_bdf_block_devs "$bdf"; then
						in_use=1
					fi
				fi
				if [[ $dev_type == vmd ]]; then
					if [[ $PCI_ALLOWED != *"$bdf"* ]]; then
						pci_dev_echo "$bdf" "Skipping not allowed VMD controller at $bdf"
						in_use=1
					elif [[ " ${drivers_d[*]} " =~ "nvme" ]]; then
						if [[ "${DRIVER_OVERRIDE}" != "none" ]]; then
							if [ "$mode" == "config" ]; then
								cat <<- MESSAGE
									Binding new driver to VMD device. If there are NVMe SSDs behind the VMD endpoint
									which are attached to the kernel NVMe driver, the binding process may go faster
									if you first run this script with DRIVER_OVERRIDE="none" to unbind only the
									NVMe SSDs, and then run again to unbind the VMD devices.
								MESSAGE
							fi
						fi
					fi
				fi
			fi
			eval "${dev_type}_d[$bdf]=$in_use"
			all_devices_d["$bdf"]=$in_use
			if [[ -e /sys/bus/pci/devices/$bdf/driver ]]; then
				driver=$(readlink -f "/sys/bus/pci/devices/$bdf/driver")
				drivers_d["$bdf"]=${driver##*/}
			fi
		done
	done < <(grep -E "$ids" "$rootdir/include/spdk/pci_ids.h")
}

function collect_driver() {
	local bdf=$1
	local drivers driver

	if [[ -e /sys/bus/pci/devices/$bdf/modalias ]] \
		&& drivers=($(modprobe -R "$(< "/sys/bus/pci/devices/$bdf/modalias")")); then
		# Pick first entry in case multiple aliases are bound to a driver.
		driver=$(readlink -f "/sys/module/${drivers[0]}/drivers/pci:"*)
		driver=${driver##*/}
	else
		[[ -n ${nvme_d["$bdf"]} ]] && driver=nvme
		[[ -n ${ioat_d["$bdf"]} ]] && driver=ioatdma
		[[ -n ${dsa_d["$bdf"]} ]] && driver=dsa
		[[ -n ${iaa_d["$bdf"]} ]] && driver=iaa
		[[ -n ${virtio_d["$bdf"]} ]] && driver=virtio-pci
		[[ -n ${vmd_d["$bdf"]} ]] && driver=vmd
	fi 2> /dev/null
	echo "$driver"
}

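# Illustrative: the modalias lookup usually resolves to the device's native
# kernel driver, e.g. for an NVMe controller at a hypothetical BDF:
#   collect_driver 0000:01:00.0   # -> nvme
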
function verify_bdf_block_devs() {
	local bdf=$1
	local blknames
	blknames=($(get_used_bdf_block_devs "$bdf")) || return 1

	if ((${#blknames[@]} > 0)); then
		local IFS=","
		pci_dev_echo "$bdf" "Active devices: ${blknames[*]}, so not binding PCI dev"
		return 1
	fi
}

function configure_linux_pci() {
	local driver_path=""
	driver_name=""
	igb_uio_fallback=""

	if [[ -r "$rootdir/dpdk/build-tmp/kernel/linux/igb_uio/igb_uio.ko" ]]; then
		# igb_uio is a common driver to override with and it depends on uio.
		modprobe uio || true
		if ! check_for_driver igb_uio || insmod "$rootdir/dpdk/build-tmp/kernel/linux/igb_uio/igb_uio.ko"; then
			igb_uio_fallback="$rootdir/dpdk/build-tmp/kernel/linux/igb_uio/igb_uio.ko"
		fi
	fi

	if [[ "${DRIVER_OVERRIDE}" == "none" ]]; then
		driver_name=none
	elif [[ -n "${DRIVER_OVERRIDE}" ]]; then
		driver_path="$DRIVER_OVERRIDE"
		driver_name="${DRIVER_OVERRIDE##*/}"
		# modprobe and the sysfs don't use the .ko suffix.
		driver_name=${driver_name%.ko}
		# path = name -> there is no path
		if [[ "$driver_path" = "$driver_name" ]]; then
			driver_path=""
		fi
	elif [[ -n "$(ls /sys/kernel/iommu_groups)" || (-e \
	/sys/module/vfio/parameters/enable_unsafe_noiommu_mode && \
	"$(cat /sys/module/vfio/parameters/enable_unsafe_noiommu_mode)" == "Y") ]]; then
		driver_name=vfio-pci
		# Just in case, attempt to load the VFIO_IOMMU_TYPE1 module into the kernel - this
		# should be done automatically by modprobe since this particular module should
		# be a part of vfio-pci dependencies, however, on some distros, it seems that
		# it's not the case. See #1689.
		if modinfo vfio_iommu_type1 > /dev/null; then
			modprobe vfio_iommu_type1
		fi
	elif ! check_for_driver uio_pci_generic || modinfo uio_pci_generic > /dev/null 2>&1; then
		driver_name=uio_pci_generic
	elif [[ -e $igb_uio_fallback ]]; then
		driver_path="$igb_uio_fallback"
		driver_name="igb_uio"
		echo "WARNING: uio_pci_generic not detected - using $driver_name"
	else
		echo "No valid drivers found [vfio-pci, uio_pci_generic, igb_uio]. Please enable one of the kernel modules."
		return 1
	fi

	# modprobe expects the module to live in the kernel's module directory. If the
	# user passed in a path, use insmod instead.
	if [[ $driver_name != "none" ]]; then
		if [[ -n "$driver_path" ]]; then
			insmod "$driver_path" || true
		else
			modprobe "$driver_name"
		fi
	fi

	for bdf in "${!all_devices_d[@]}"; do
		if ((all_devices_d["$bdf"] == 0)); then
			if [[ -n ${nvme_d["$bdf"]} ]]; then
				# Some nvme controllers may take a significant amount of time while being
				# unbound from the driver. Put that task into the background to speed up the
				# whole process. Currently this is done only for the devices bound to the
				# nvme driver as others, e.g. ioatdma, trigger a kernel BUG when being
				# unbound in parallel. See https://bugzilla.kernel.org/show_bug.cgi?id=209041.
				linux_bind_driver "$bdf" "$driver_name" &
			else
				linux_bind_driver "$bdf" "$driver_name"
			fi
		fi
	done
	wait

	echo "1" > "/sys/bus/pci/rescan"
}

function cleanup_linux() {
	local dirs_to_clean=() files_to_clean=() opened_files=() file_locks=()
	local match_spdk="spdk_tgt|iscsi|vhost|nvmf|rocksdb|bdevio|bdevperf|vhost_fuzz|nvme_fuzz|accel_perf|bdev_svc"

	dirs_to_clean=({/var/run,/tmp}/dpdk/spdk{,_pid}+([0-9]))
	if [[ -d $XDG_RUNTIME_DIR ]]; then
		dirs_to_clean+=("$XDG_RUNTIME_DIR/dpdk/spdk"{,_pid}+([0-9]))
	fi
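	# e.g. the patterns above match paths like /var/run/dpdk/spdk0 or
	# /tmp/dpdk/spdk_pid12345 (hypothetical instance IDs/PIDs).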

	for dir in "${dirs_to_clean[@]}"; do
		files_to_clean+=("$dir/"*)
	done
	file_locks+=(/var/tmp/spdk_pci_lock*)
	file_locks+=(/var/tmp/spdk_cpu_lock*)

	files_to_clean+=(/dev/shm/@(@($match_spdk)_trace|spdk_iscsi_conns)*)
	files_to_clean+=("${file_locks[@]}")

	# This may fail in case a path that readlink attempts to resolve suddenly
	# disappears (as may happen with terminating processes).
	opened_files+=($(readlink -f /proc/+([0-9])/fd/+([0-9]))) || true

	if ((${#opened_files[@]} == 0)); then
		echo "Can't get list of opened files!"
		exit 1
	fi

	echo 'Cleaning'
	for f in "${files_to_clean[@]}"; do
		[[ -e $f ]] || continue
		if [[ ${opened_files[*]} != *"$f"* ]]; then
			echo "Removing:    $f"
			rm "$f"
		else
			echo "Still open: $f"
		fi
	done

	for dir in "${dirs_to_clean[@]}"; do
		[[ -d $dir ]] || continue
		if [[ ${opened_files[*]} != *"$dir"* ]]; then
			echo "Removing:    $dir"
			rmdir "$dir"
		else
			echo "Still open: $dir"
		fi
	done
	echo "Clean"
}

check_hugepages_alloc() {
	local hp_int=$1
	local allocated_hugepages

	allocated_hugepages=$(< "$hp_int")

	if ((NRHUGE <= allocated_hugepages)) && [[ $SHRINK_HUGE != yes ]]; then
		echo "INFO: Requested $NRHUGE hugepages but $allocated_hugepages already allocated ${2:+on node$2}"
		return 0
	fi

	echo $((NRHUGE < 0 ? 0 : NRHUGE)) > "$hp_int"

	allocated_hugepages=$(< "$hp_int")
	if ((allocated_hugepages < NRHUGE)); then
		cat <<- ERROR

			## ERROR: requested $NRHUGE hugepages but only $allocated_hugepages could be allocated ${2:+on node$2}.
			## Memory might be heavily fragmented. Please try flushing the system cache, or reboot the machine.
		ERROR
		return 1
	fi
}

clear_hugepages() { echo 0 > /proc/sys/vm/nr_hugepages; }

configure_linux_hugepages() {
	local node
	local nodes=() nodes_to_use nodes_hp

	if [[ $CLEAR_HUGE == yes ]]; then
		clear_hugepages
	fi

	if [[ $HUGE_EVEN_ALLOC == yes ]]; then
		clear_hugepages
		check_hugepages_alloc /proc/sys/vm/nr_hugepages
		return 0
	fi

	for node in /sys/devices/system/node/node*; do
		[[ -e $node ]] || continue
		nodes[${node##*node}]=$node/hugepages/hugepages-${HUGEPGSZ}kB/nr_hugepages
	done

	if ((${#nodes[@]} == 0)); then
		# No NUMA support? Fall back to the common interface.
		check_hugepages_alloc /proc/sys/vm/nr_hugepages
		return 0
	fi

	IFS="," read -ra nodes_to_use <<< "$HUGENODE"
	if ((${#nodes_to_use[@]} == 0)); then
		nodes_to_use[0]=0
	fi

	# Align indexes with node ids
	for node in "${!nodes_to_use[@]}"; do
		if [[ ${nodes_to_use[node]} =~ ^nodes_hp\[[0-9]+\]= ]]; then
			eval "${nodes_to_use[node]}"
		elif [[ ${nodes_to_use[node]} =~ ^[0-9]+$ ]]; then
			nodes_hp[nodes_to_use[node]]=$NRHUGE
		fi
	done

	for node in "${!nodes_hp[@]}"; do
		if [[ -z ${nodes[node]} ]]; then
			echo "Node $node doesn't exist, ignoring" >&2
			continue
		fi
		NRHUGE=${nodes_hp[node]:-$NRHUGE} check_hugepages_alloc "${nodes[node]}" "$node"
	done
}

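# Illustrative: allocate 2048 pages on node0 and the default NRHUGE on node1:
#   HUGENODE='nodes_hp[0]=2048,1' ./scripts/setup.sh
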
function configure_linux() {
	configure_linux_pci
	hugetlbfs_mounts=$(linux_hugetlbfs_mounts)

	if [ -z "$hugetlbfs_mounts" ]; then
		hugetlbfs_mounts=/mnt/huge
		echo "Mounting hugetlbfs at $hugetlbfs_mounts"
		mkdir -p "$hugetlbfs_mounts"
		mount -t hugetlbfs nodev "$hugetlbfs_mounts"
	fi

	configure_linux_hugepages

	if [ "$driver_name" = "vfio-pci" ]; then
		if [ -n "$TARGET_USER" ]; then
			for mount in $hugetlbfs_mounts; do
				chown "$TARGET_USER" "$mount"
				chmod g+w "$mount"
			done

			MEMLOCK_AMNT=$(su "$TARGET_USER" -c "ulimit -l")
			if [[ $MEMLOCK_AMNT != "unlimited" ]]; then
				MEMLOCK_MB=$((MEMLOCK_AMNT / 1024))
				cat <<- MEMLOCK
					"$TARGET_USER" user memlock limit: $MEMLOCK_MB MB

					This is the maximum amount of memory you will be
					able to use with DPDK and VFIO if run as user "$TARGET_USER".
					To change this, please adjust limits.conf memlock limit for user "$TARGET_USER".
				MEMLOCK
				if ((MEMLOCK_AMNT < 65536)); then
					echo ""
					echo "## WARNING: memlock limit is less than 64MB"
					echo -n "## DPDK with VFIO may not be able to initialize "
					echo "if run as user \"$TARGET_USER\"."
				fi
			fi
		fi
	fi

	if [ "$(uname -i)" == "x86_64" ] && [ ! -e /dev/cpu/0/msr ]; then
		# Some distros build msr as a module. Make sure it's loaded to ensure
		# DPDK can easily figure out the TSC rate rather than relying on 100ms
		# sleeps.
		modprobe msr &> /dev/null || true
	fi
}

function reset_linux_pci() {
	# virtio
	# TODO: check if virtio-pci is loaded first and just unbind if it is not loaded
	# Requires some more investigation - for example, some kernels do not seem to have
	# virtio-pci but just virtio_scsi instead. Also need to make sure we get the
	# underscore vs. dash right in the virtio_scsi name.
	modprobe virtio-pci || true
	for bdf in "${!all_devices_d[@]}"; do
		((all_devices_d["$bdf"] == 0)) || continue

		driver=$(collect_driver "$bdf")
		if [[ -n $driver ]] && ! check_for_driver "$driver"; then
			linux_bind_driver "$bdf" "$driver"
		else
			linux_unbind_driver "$bdf"
		fi
	done

	echo "1" > "/sys/bus/pci/rescan"
}

function reset_linux() {
	reset_linux_pci
	for mount in $(linux_hugetlbfs_mounts); do
		for hp in "$mount"/spdk*map_*; do
			flock -n "$hp" true && rm -f "$hp"
		done
	done
	rm -f /run/.spdk*
}

function status_linux() {
	echo "Hugepages" >&2
	printf "%-6s %10s %8s / %6s\n" "node" "hugesize" "free" "total" >&2

	numa_nodes=0
	for path in /sys/devices/system/node/node*/hugepages/hugepages-*/; do
		numa_nodes=$((numa_nodes + 1))
		free_pages=$(cat "$path/free_hugepages")
		all_pages=$(cat "$path/nr_hugepages")

		[[ $path =~ (node[0-9]+)/hugepages/hugepages-([0-9]+kB) ]]

		node=${BASH_REMATCH[1]}
		huge_size=${BASH_REMATCH[2]}

		printf "%-6s %10s %8s / %6s\n" "$node" "$huge_size" "$free_pages" "$all_pages"
	done

	# fall back to system-wide hugepages
	if [ "$numa_nodes" = "0" ]; then
		free_pages=$(grep HugePages_Free /proc/meminfo | awk '{ print $2 }')
		all_pages=$(grep HugePages_Total /proc/meminfo | awk '{ print $2 }')
		node="-"
		huge_size="$HUGEPGSZ"

		printf "%-6s %10s %8s / %6s\n" "$node" "$huge_size" "$free_pages" "$all_pages"
	fi

	printf '\n%-8s %-15s %-6s %-6s %-7s %-16s %-10s %s\n' \
		"Type" "BDF" "Vendor" "Device" "NUMA" "Driver" "Device" "Block devices" >&2

	sorted_bdfs=($(printf '%s\n' "${!all_devices_d[@]}" | sort))

	for bdf in "${sorted_bdfs[@]}"; do
		driver=${drivers_d["$bdf"]}
		if [ "$numa_nodes" = "0" ]; then
			node="-"
		else
			node=$(cat "/sys/bus/pci/devices/$bdf/numa_node")
			if ((node == -1)); then
				node=unknown
			fi
		fi
		if [ "$driver" = "nvme" ] && [ -d /sys/bus/pci/devices/$bdf/nvme ]; then
			name=$(ls "/sys/bus/pci/devices/$bdf/nvme")
		else
			name="-"
		fi

		if [[ -n ${nvme_d["$bdf"]} || -n ${virtio_d["$bdf"]} ]]; then
			blknames=($(get_block_dev_from_bdf "$bdf"))
		else
			blknames=("-")
		fi

		desc=""
		desc=${desc:-${nvme_d["$bdf"]:+NVMe}}
		desc=${desc:-${ioat_d["$bdf"]:+I/OAT}}
		desc=${desc:-${dsa_d["$bdf"]:+DSA}}
		desc=${desc:-${iaa_d["$bdf"]:+IAA}}
		desc=${desc:-${virtio_d["$bdf"]:+virtio}}
		desc=${desc:-${vmd_d["$bdf"]:+VMD}}

		printf '%-8s %-15s %-6s %-6s %-7s %-16s %-10s %s\n' \
			"$desc" "$bdf" "${pci_ids_vendor["$bdf"]#0x}" "${pci_ids_device["$bdf"]#0x}" \
			"$node" "${driver:--}" "${name:-}" "${blknames[*]:--}"
	done
}

function status_freebsd() {
	local pci

	status_print() (
		local type=$1
		local dev driver

		shift

		for pci; do
			printf '%-8s %-15s %-6s %-6s %-16s\n' \
				"$type" \
				"$pci" \
				"${pci_ids_vendor["$pci"]}" \
				"${pci_ids_device["$pci"]}" \
				"${pci_bus_driver["$pci"]}"
		done | sort -k2,2
	)

	local contigmem=present
	local contigmem_buffer_size
	local contigmem_num_buffers

	if ! kldstat -q -m contigmem; then
		contigmem="not present"
	fi
	if ! contigmem_buffer_size=$(kenv hw.contigmem.buffer_size 2> /dev/null); then
		contigmem_buffer_size="not set"
	fi
	if ! contigmem_num_buffers=$(kenv hw.contigmem.num_buffers 2> /dev/null); then
		contigmem_num_buffers="not set"
	fi

	cat <<- BSD_INFO
		Contigmem ($contigmem)
		Buffer Size: $contigmem_buffer_size
		Num Buffers: $contigmem_num_buffers

	BSD_INFO

	printf '\n%-8s %-15s %-6s %-6s %-16s\n' \
		"Type" "BDF" "Vendor" "Device" "Driver" >&2

	status_print "NVMe" "${!nvme_d[@]}"
	status_print "I/OAT" "${!ioat_d[@]}"
	status_print "DSA" "${!dsa_d[@]}"
	status_print "IAA" "${!iaa_d[@]}"
	status_print "VMD" "${!vmd_d[@]}"
}

function configure_freebsd_pci() {
	local BDFS=()

	BDFS+=("${!nvme_d[@]}")
	BDFS+=("${!ioat_d[@]}")
	BDFS+=("${!dsa_d[@]}")
	BDFS+=("${!iaa_d[@]}")
	BDFS+=("${!vmd_d[@]}")

	# Drop the domain part from all the addresses
	BDFS=("${BDFS[@]#*:}")
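	# e.g. 0000:01:00.0 becomes 01:00.0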

	local IFS=","
	kldunload nic_uio.ko || true
	kenv hw.nic_uio.bdfs="${BDFS[*]}"
	kldload nic_uio.ko
}

function configure_freebsd() {
	if ! check_for_driver_freebsd; then
		echo "DPDK drivers (contigmem and/or nic_uio) are missing, aborting" >&2
		return 1
	fi
	configure_freebsd_pci
	# If contigmem is already loaded but the HUGEMEM specified doesn't match the
	# previous value, unload contigmem so that we can reload with the new value.
	if kldstat -q -m contigmem; then
		# contigmem may be loaded, but the kernel environment doesn't necessarily
		# have to be set at this point. If it isn't, kenv will fail to pick up
		# the hw. options. Handle it.
		if ! contigmem_num_buffers=$(kenv hw.contigmem.num_buffers); then
			contigmem_num_buffers=-1
		fi 2> /dev/null
		if ((contigmem_num_buffers != HUGEMEM / 256)); then
			kldunload contigmem.ko
		fi
	fi
	if ! kldstat -q -m contigmem; then
		kenv hw.contigmem.num_buffers=$((HUGEMEM / 256))
		kenv hw.contigmem.buffer_size=$((256 * 1024 * 1024))
		kldload contigmem.ko
	fi
}

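# Illustrative: with the default HUGEMEM=2048 (MB), contigmem is loaded with
# 2048 / 256 = 8 buffers of 256 MB each.
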
function reset_freebsd() {
	kldunload contigmem.ko || true
	kldunload nic_uio.ko || true
}

CMD=reset cache_pci_bus

mode=$1

if [ -z "$mode" ]; then
	mode="config"
fi

: ${HUGEMEM:=2048}
: ${PCI_ALLOWED:=""}
: ${PCI_BLOCKED:=""}

if [ -n "$NVME_ALLOWED" ]; then
	PCI_ALLOWED="$PCI_ALLOWED $NVME_ALLOWED"
fi

if [ -n "$SKIP_PCI" ]; then
	PCI_ALLOWED="none"
fi

if [ -z "$TARGET_USER" ]; then
	TARGET_USER="$SUDO_USER"
	if [ -z "$TARGET_USER" ]; then
		TARGET_USER=$(logname 2> /dev/null) || true
	fi
fi

collect_devices "$mode"

if [[ $mode == reset && $PCI_BLOCK_SYNC_ON_RESET == yes ]]; then
	# Note that this will wait only for the first block device attached to
	# a given storage controller. For nvme this may miss some of the devs
	# in case multiple namespaces are in place.
	# FIXME: Wait for nvme controller(s) to be in live state and determine
	# number of configured namespaces, build list of potential block devs
	# and pass them to sync_dev_uevents. Is it worth the effort?
	bdfs_to_wait_for=()
	for bdf in "${!all_devices_d[@]}"; do
		((all_devices_d["$bdf"] == 0)) || continue
		if [[ -n ${nvme_d["$bdf"]} || -n ${virtio_d["$bdf"]} ]]; then
			[[ $(collect_driver "$bdf") != "${drivers_d["$bdf"]}" ]] || continue
			bdfs_to_wait_for+=("$bdf")
		fi
	done
	if ((${#bdfs_to_wait_for[@]} > 0)); then
		echo "Waiting for block devices as requested"
		export UEVENT_TIMEOUT=5 DEVPATH_LOOKUP=yes DEVPATH_SUBSYSTEM=pci
		"$rootdir/scripts/sync_dev_uevents.sh" \
			block/disk \
			"${bdfs_to_wait_for[@]}" &
		sync_pid=$!
	fi
fi

if [[ $os == Linux ]]; then
	if [[ -n $HUGEPGSZ && ! -e /sys/kernel/mm/hugepages/hugepages-${HUGEPGSZ}kB ]]; then
		echo "${HUGEPGSZ}kB is not supported by the running kernel, ignoring" >&2
		unset -v HUGEPGSZ
	fi

	HUGEPGSZ=${HUGEPGSZ:-$(grep Hugepagesize /proc/meminfo | cut -d : -f 2 | tr -dc '0-9')}
	HUGEPGSZ_MB=$((HUGEPGSZ / 1024))
	: ${NRHUGE=$(((HUGEMEM + HUGEPGSZ_MB - 1) / HUGEPGSZ_MB))}
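	# Ceiling division, e.g. with the default HUGEMEM=2048 (MB) and 2048 kB
	# hugepages (HUGEPGSZ_MB=2): NRHUGE = (2048 + 2 - 1) / 2 = 1024 pages.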

	if [ "$mode" == "config" ]; then
		configure_linux
	elif [ "$mode" == "cleanup" ]; then
		cleanup_linux
		clear_hugepages
	elif [ "$mode" == "reset" ]; then
		reset_linux
	elif [ "$mode" == "status" ]; then
		status_linux
	elif [ "$mode" == "help" ]; then
		usage $0
	else
		usage $0 "Invalid argument '$mode'"
	fi
else
	if [ "$mode" == "config" ]; then
		configure_freebsd
	elif [ "$mode" == "reset" ]; then
		reset_freebsd
	elif [ "$mode" == "cleanup" ]; then
		echo "setup.sh cleanup function not yet supported on $os"
	elif [ "$mode" == "status" ]; then
		status_freebsd
	elif [ "$mode" == "help" ]; then
		usage $0
	else
		usage $0 "Invalid argument '$mode'"
	fi
fi

if [[ -e /proc/$sync_pid/status ]]; then
	wait "$sync_pid"
fi