#!/usr/bin/env bash
#  SPDX-License-Identifier: BSD-3-Clause
#  Copyright (C) 2016 Intel Corporation
#  All rights reserved.
#
set -e
shopt -s nullglob extglob

os=$(uname -s)

if [[ $os != Linux && $os != FreeBSD ]]; then
	echo "Unsupported platform ($os), aborting"
	exit 1
fi

rootdir=$(readlink -f $(dirname $0))/..
source "$rootdir/scripts/common.sh"

function usage() {
	if [[ $os == Linux ]]; then
		options="[config|reset|status|cleanup|interactive|help]"
	else
		options="[config|reset|interactive|help]"
	fi

	[[ -n $2 ]] && (
		echo "$2"
		echo ""
	)
	echo "Helper script for allocating hugepages and binding NVMe, I/OAT, VMD and Virtio devices"
	echo "to a generic VFIO kernel driver. If VFIO is not available on the system, this script"
	echo "will fall back to UIO. NVMe and Virtio devices with active mountpoints will be ignored."
	echo "All hugepage operations use the default hugepage size on the system (hugepagesz)."
	echo "Usage: $(basename $1) $options"
	echo
	echo "$options - as follows:"
	echo "config            Default mode. Allocate hugepages and bind PCI devices."
	if [[ $os == Linux ]]; then
		echo "cleanup           Remove any orphaned files that may be left in the system after an SPDK application exits"
	fi
	echo "reset             Rebind PCI devices back to their original drivers."
	echo "                  Also clean up any leftover SPDK files/resources."
	echo "                  Hugepage memory size will remain unchanged."
	if [[ $os == Linux ]]; then
		echo "status            Print status of all SPDK-compatible devices on the system."
	fi
	echo "interactive       Executes script in interactive mode."
	echo "help              Print this help message."
	echo
	echo "The following environment variables can be specified."
	echo "HUGEMEM           Size of hugepage memory to allocate (in MB). 2048 by default."
	echo "                  For NUMA systems, the hugepages will be distributed on node0 by"
	echo "                  default."
	echo "HUGE_EVEN_ALLOC   If set to 'yes', hugepages will be evenly distributed across all"
	echo "                  of the system's NUMA nodes (effectively ignoring anything set in HUGENODE)."
	echo "                  Uses the kernel's default hugepage size."
	echo "NRHUGE            Number of hugepages to allocate. This variable overrides HUGEMEM."
	echo "HUGENODE          Specific NUMA node to allocate hugepages on. Multiple nodes can be"
	echo "                  separated with commas. By default, NRHUGE will be applied on each node."
	echo "                  Hugepages can be defined per node with e.g.:"
	echo "                  HUGENODE='nodes_hp[0]=2048,nodes_hp[1]=512,2' - this will allocate"
	echo "                  2048 pages for node0, 512 for node1 and default NRHUGE for node2."
	echo "HUGEPGSZ          Size of the hugepages to use in kB. If not set, the kernel's default"
	echo "                  setting is used."
	echo "SHRINK_HUGE       If set to 'yes', hugepage allocation won't be skipped when the"
	echo "                  number of requested hugepages is lower than what's already"
	echo "                  allocated. Doesn't apply when HUGE_EVEN_ALLOC is in use."
	echo "CLEAR_HUGE        If set to 'yes', an attempt to remove hugepages from all nodes will"
	echo "                  be made prior to allocation."
	echo "PCI_ALLOWED"
	echo "PCI_BLOCKED       Whitespace-separated list of PCI devices (NVMe, I/OAT, VMD, Virtio)."
	echo "                  Each device must be specified as a full PCI address."
	echo "                  E.g. PCI_ALLOWED=\"0000:01:00.0 0000:02:00.0\""
	echo "                  To block all PCI devices: PCI_ALLOWED=\"none\""
	echo "                  To allow all PCI devices except 0000:01:00.0: PCI_BLOCKED=\"0000:01:00.0\""
	echo "                  To allow only PCI device 0000:01:00.0: PCI_ALLOWED=\"0000:01:00.0\""
	echo "                  If PCI_ALLOWED and PCI_BLOCKED are empty or unset, all PCI devices"
	echo "                  will be bound."
	echo "                  Each device in PCI_BLOCKED will be ignored (driver won't be changed)."
	echo "                  PCI_BLOCKED has precedence over PCI_ALLOWED."
	echo "TARGET_USER       User that will own the hugepage mountpoint directory and vfio groups."
	echo "                  By default the current user will be used."
	echo "DRIVER_OVERRIDE   Disable automatic vfio-pci/uio_pci_generic selection and forcefully"
	echo "                  bind devices to the given driver."
	echo "                  E.g. DRIVER_OVERRIDE=uio_pci_generic or DRIVER_OVERRIDE=/home/public/dpdk/build/kmod/igb_uio.ko"
	echo "PCI_BLOCK_SYNC_ON_RESET"
	echo "                  If set in the environment, an attempt to wait for block devices associated"
	echo "                  with a given PCI device will be made upon reset."
	echo "UNBIND_ENTIRE_IOMMU_GROUP"
	echo "                  If set, all devices in the NVMe device's IOMMU group will be unbound from their drivers."
	echo "                  Use with caution."
	exit 0
}
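
# Illustrative invocations (the addresses and sizes below are examples, not defaults):
#   HUGEMEM=4096 ./setup.sh                        - allocate 4 GB of hugepages and bind all supported devices
#   PCI_ALLOWED="0000:01:00.0" ./setup.sh config   - bind only the listed device
#   DRIVER_OVERRIDE=uio_pci_generic ./setup.sh     - force a specific driver
#   ./setup.sh reset                               - rebind devices to their original drivers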

# In monolithic kernels lsmod won't work, so fall back to checking /sys/module.
# We also check /sys/bus/pci/drivers/ as neither lsmod nor /sys/module may
# contain the needed info (e.g. on Fedora-like OSes).
function check_for_driver() {
	if [[ -z $1 ]]; then
		return 0
	fi

	if lsmod | grep -q ${1//-/_}; then
		return 1
	fi

	if [[ -d /sys/module/${1} || -d \
		/sys/module/${1//-/_} || -d \
		/sys/bus/pci/drivers/${1} || -d \
		/sys/bus/pci/drivers/${1//-/_} ]]; then
		return 2
	fi
	return 0
}
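
# Note: check_for_driver() returns 0 when the driver was NOT found, 1 when lsmod
# lists it and 2 when it is only visible under sysfs, so callers negate it, e.g.:
#   if ! check_for_driver uio_pci_generic; then echo "uio_pci_generic not loaded"; fi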

function check_for_driver_freebsd() {
	# Check if dpdk drivers (nic_uio, contigmem) are in the kernel's module path.
	local search_paths path driver
	IFS=";" read -ra search_paths < <(kldconfig -rU)

	for driver in contigmem.ko nic_uio.ko; do
		for path in "${search_paths[@]}"; do
			[[ -f $path/$driver ]] && continue 2
		done
		return 1
	done
	return 0
}

function pci_dev_echo() {
	local bdf="$1"
	shift
	echo "$bdf (${pci_ids_vendor["$bdf"]#0x} ${pci_ids_device["$bdf"]#0x}): $*"
}

function probe_driver() {
	local bdf=$1
	local driver_name=$2
	old_driver_name=${drivers_d["$bdf"]:-no driver}

	if [[ $driver_name == "$old_driver_name" ]]; then
		pci_dev_echo "$bdf" "Already using the $old_driver_name driver"
		return 0
	fi

	if [[ $old_driver_name != "no driver" ]]; then
		echo "$bdf" > "/sys/bus/pci/devices/$bdf/driver/unbind"
	fi

	pci_dev_echo "$bdf" "$old_driver_name -> $driver_name"

	if [[ $driver_name == "none" ]]; then
		return 0
	fi

	local probe_attempts=0
	echo "$driver_name" > "/sys/bus/pci/devices/$bdf/driver_override"
	while ! echo "$bdf" > "/sys/bus/pci/drivers_probe" && ((probe_attempts++ < 10)); do
		pci_dev_echo "$bdf" "failed to bind to $driver_name, retrying ($probe_attempts)"
		sleep 0.5
	done 2> /dev/null

	echo "" > "/sys/bus/pci/devices/$bdf/driver_override"

	if [[ ! -e /sys/bus/pci/drivers/$driver_name/$bdf ]]; then
		pci_dev_echo "$bdf" "failed to bind to $driver_name, aborting"
		return 1
	fi
}
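
# For reference, the sysfs sequence above is roughly equivalent to binding a device
# by hand (BDF and driver below are illustrative):
#   echo vfio-pci > /sys/bus/pci/devices/0000:01:00.0/driver_override
#   echo 0000:01:00.0 > /sys/bus/pci/drivers_probe
#   echo "" > /sys/bus/pci/devices/0000:01:00.0/driver_override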

function linux_bind_driver() {
	local bdf="$1"
	local driver_name="$2"

	probe_driver "$bdf" "$driver_name"

	local iommu_group=${pci_iommu_groups["$bdf"]}
	if [ -e "/dev/vfio/$iommu_group" ]; then
		if [ -n "$TARGET_USER" ]; then
			chown "$TARGET_USER" "/dev/vfio/$iommu_group"
		fi
	fi

	local iommug=("${!iommu_groups[iommu_group]}")
	local _bdf _driver
	if ((${#iommug[@]} > 1)) && [[ $driver_name == vfio* ]]; then
		pci_dev_echo "$bdf" "WARNING: detected multiple devices (${#iommug[@]}) under the same IOMMU group!"
		for _bdf in "${iommug[@]}"; do
			[[ $_bdf == "$bdf" ]] && continue
			_driver=$(readlink -f "/sys/bus/pci/devices/$_bdf/driver") && _driver=${_driver##*/}
			if [[ $_driver == "$driver_name" ]]; then
				continue
			fi
			# See what DPDK considers to be a "viable" iommu group: dpdk/lib/eal/linux/eal_vfio.c -> rte_vfio_setup_device()
			pci_dev_echo "$bdf" "WARNING: ${_bdf##*/} not bound to $driver_name (${_driver##*/})"
			pci_dev_echo "$bdf" "WARNING: All devices in the IOMMU group must be bound to the same driver or unbound"
			if [[ $UNBIND_ENTIRE_IOMMU_GROUP == yes ]]; then
				pci_dev_echo "$bdf" "WARNING: Attempting to unbind ${_bdf##*/}"
				drivers_d["${_bdf##*/}"]=${_driver##*/}
				probe_driver "${_bdf##*/}" none
			fi
		done
	fi
}

function linux_unbind_driver() {
	local bdf="$1"
	local old_driver_name=${drivers_d["$bdf"]:-no driver}

	if [[ $old_driver_name == "no driver" ]]; then
		pci_dev_echo "$bdf" "Not bound to any driver"
		return 0
	fi

	if [[ -e /sys/bus/pci/drivers/$old_driver_name ]]; then
		echo "$bdf" > "/sys/bus/pci/drivers/$old_driver_name/unbind"
		echo "" > "/sys/bus/pci/devices/$bdf/driver_override"
	fi

	pci_dev_echo "$bdf" "$old_driver_name -> no driver"
}

function linux_hugetlbfs_mounts() {
	mount | grep ' type hugetlbfs ' | awk '{ print $3 }'
}

function get_block_dev_from_bdf() {
	local bdf=$1
	local block blocks=() ctrl

	for block in /sys/block/*; do
		if [[ $block == *nvme* ]]; then
			ctrl=$(readlink -f "$block/device") ctrl=${ctrl##*/}
			if [[ -e /sys/class/nvme/$ctrl && $(< "/sys/class/nvme/$ctrl/address") == "$bdf" ]]; then
				blocks+=("${block##*/}")
			fi
		elif [[ $(readlink -f "$block/device") == *"/$bdf/"* ]]; then
			blocks+=("${block##*/}")
		fi
	done
	printf '%s\n' "${blocks[@]}"
}
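
# Example (illustrative): "get_block_dev_from_bdf 0000:01:00.0" may print "nvme0n1"
# for an NVMe controller at that address, or nothing if it exposes no block devices.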

function get_used_bdf_block_devs() {
	local bdf=$1
	local blocks block blockp dev mount holder
	local used

	hash lsblk &> /dev/null || return 1
	blocks=($(get_block_dev_from_bdf "$bdf"))

	for block in "${blocks[@]}"; do
		# Check if the device is held by some other device, regardless of whether
		# it's mounted or not.
		for holder in "/sys/class/block/$block"*/holders/*; do
			[[ -e $holder ]] || continue
			blockp=${holder%/holders*} blockp=${blockp##*/}
			if [[ -e $holder/slaves/$blockp ]]; then
				used+=("holder@$blockp:${holder##*/}")
			fi
		done
		while read -r dev mount; do
			if [[ -e $mount ]]; then
				used+=("mount@$block:$dev")
			fi
		done < <(lsblk -l -n -o NAME,MOUNTPOINT "/dev/$block")
		if ((${#used[@]} == 0)); then
			# Make sure we check if there's any valid data present on the target device,
			# regardless of whether it's actively used or not. This is mainly done to make
			# sure we don't miss more complex setups like ZFS pools, etc.
			if block_in_use "$block" > /dev/null; then
				used+=("data@$block")
			fi
		fi
	done

	if ((${#used[@]} > 0)); then
		printf '%s\n' "${used[@]}"
	fi
}
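
# Each reported entry is prefixed with the reason the device counts as used, e.g.
# "mount@nvme0n1:nvme0n1p1", "holder@nvme0n1p1:dm-0" or "data@nvme0n1" (names are
# illustrative).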

is_nvme_behind_vmd() {
	local nvme_bdf=$1 dev_path

	IFS="/" read -ra dev_path < <(readlink -f "/sys/bus/pci/devices/$nvme_bdf")

	for dev in "${dev_path[@]}"; do
		[[ -n $dev && -n ${vmd_d["$dev"]} ]] && echo $dev && return 0
	done
	return 1
}

is_nvme_iommu_shared_with_vmd() {
	local nvme_bdf=$1 vmd

	# This use-case is quite specific to vfio-pci|iommu setup
	is_iommu_enabled || return 1

	[[ -n ${nvme_vmd_d["$nvme_bdf"]} ]] || return 1
	# nvme is behind VMD ...
	((pci_iommu_groups["$nvme_bdf"] == pci_iommu_groups["${nvme_vmd_d["$nvme_bdf"]}"])) || return 1
	# ... and it shares iommu_group with it
}

function collect_devices() {
	# NVMe, IOAT, DSA, IAA, VIRTIO, VMD

	local ids dev_type dev_id bdf bdfs in_use driver _vmd

	ids+="PCI_DEVICE_ID_INTEL_IOAT"
	ids+="|PCI_DEVICE_ID_INTEL_DSA"
	ids+="|PCI_DEVICE_ID_INTEL_IAA"
	ids+="|PCI_DEVICE_ID_VIRTIO"
	ids+="|PCI_DEVICE_ID_INTEL_VMD"
	ids+="|SPDK_PCI_CLASS_NVME"
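	# The prefixes above are matched against "#define <name> <id>" lines in
	# include/spdk/pci_ids.h, e.g. (illustrative):
	#   #define PCI_DEVICE_ID_INTEL_IOAT_SKX 0x2021
	# The read loop below picks up the macro name as dev_type and the id as dev_id.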

	local -gA nvme_d ioat_d dsa_d iaa_d virtio_d vmd_d all_devices_d drivers_d types_d all_devices_type_d nvme_vmd_d

	while read -r _ dev_type dev_id; do
		bdfs=(${pci_bus_cache["0x8086:$dev_id"]})
		[[ $dev_type == *NVME* ]] && bdfs=(${pci_bus_cache["$dev_id"]})
		[[ $dev_type == *VIRT* ]] && bdfs=(${pci_bus_cache["0x1af4:$dev_id"]})
		[[ $dev_type =~ (NVME|IOAT|DSA|IAA|VIRTIO|VMD) ]] && dev_type=${BASH_REMATCH[1],,}
		types_d["$dev_type"]=1
		for bdf in "${bdfs[@]}"; do
			in_use=0
			if [[ $1 != status ]]; then
				if ! pci_can_use "$bdf"; then
					pci_dev_echo "$bdf" "Skipping denied controller at $bdf"
					in_use=1
				fi
				if [[ $dev_type == nvme || $dev_type == virtio ]]; then
					if ! verify_bdf_block_devs "$bdf"; then
						in_use=1
					fi
				fi
				if [[ $dev_type == vmd ]]; then
					if [[ $PCI_ALLOWED != *"$bdf"* ]]; then
						pci_dev_echo "$bdf" "Skipping not allowed VMD controller at $bdf"
						in_use=1
					elif [[ " ${drivers_d[*]} " =~ "nvme" ]]; then
						if [[ "${DRIVER_OVERRIDE}" != "none" ]]; then
							if [ "$mode" == "config" ]; then
								cat <<- MESSAGE
									Binding new driver to VMD device. If there are NVMe SSDs behind the VMD endpoint
									which are attached to the kernel NVMe driver, the binding process may go faster
									if you first run this script with DRIVER_OVERRIDE="none" to unbind only the
									NVMe SSDs, and then run again to unbind the VMD devices.
								MESSAGE
							fi
						fi
					fi
				fi
			fi
			eval "${dev_type}_d[$bdf]=$in_use"
			all_devices_d["$bdf"]=$in_use
			all_devices_type_d["$bdf"]=$dev_type
			if [[ -e /sys/bus/pci/devices/$bdf/driver ]]; then
				driver=$(readlink -f "/sys/bus/pci/devices/$bdf/driver")
				drivers_d["$bdf"]=${driver##*/}
			else
				drivers_d["$bdf"]=""
			fi
		done
	done < <(grep -E "$ids" "$rootdir/include/spdk/pci_ids.h")

	for bdf in "${!nvme_d[@]}"; do
		_vmd=$(is_nvme_behind_vmd "$bdf") && nvme_vmd_d["$bdf"]=$_vmd
	done

	# Check if we got any nvmes attached to VMDs sharing the same iommu_group - if there are
	# any, skip them since they won't be usable by SPDK without moving the entire VMD ctrl
	# away from the kernel first. That said, allow touching the nvmes in case the user requested
	# all devices to be unbound from any driver or if the dedicated override flag was set.
	[[ -z $ALLOW_NVME_BEHIND_VMD && $DRIVER_OVERRIDE != none ]] || return 0

	for bdf in "${!nvme_d[@]}"; do
		is_nvme_iommu_shared_with_vmd "$bdf" || continue
		nvme_d["$bdf"]=1 all_devices_d["$bdf"]=1
		pci_dev_echo "$bdf" "Skipping nvme behind VMD (${nvme_vmd_d["$bdf"]})"
	done

	return 0
}

function collect_driver() {
	local bdf=$1
	local drivers driver

	if [[ -e /sys/bus/pci/devices/$bdf/modalias ]] \
		&& drivers=($(modprobe -R "$(< "/sys/bus/pci/devices/$bdf/modalias")")); then
		# Pick first entry in case multiple aliases are bound to a driver.
		driver=$(readlink -f "/sys/module/${drivers[0]}/drivers/pci:"*)
		driver=${driver##*/}
	else
		[[ -n ${nvme_d["$bdf"]} ]] && driver=nvme
		[[ -n ${ioat_d["$bdf"]} ]] && driver=ioatdma
		[[ -n ${dsa_d["$bdf"]} ]] && driver=idxd
		[[ -n ${iaa_d["$bdf"]} ]] && driver=idxd
		[[ -n ${virtio_d["$bdf"]} ]] && driver=virtio-pci
		[[ -n ${vmd_d["$bdf"]} ]] && driver=vmd
	fi 2> /dev/null
	echo "$driver"
}
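
# For instance (illustrative BDF), for an NVMe controller
# "modprobe -R $(< /sys/bus/pci/devices/0000:01:00.0/modalias)" typically resolves to
# "nvme", which is then mapped back to its PCI driver via /sys/module/nvme/drivers/pci:*.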

function verify_bdf_block_devs() {
	local bdf=$1
	local blknames
	blknames=($(get_used_bdf_block_devs "$bdf")) || return 1

	if ((${#blknames[@]} > 0)); then
		local IFS=","
		pci_dev_echo "$bdf" "Active devices: ${blknames[*]}, so not binding PCI dev"
		return 1
	fi
}

function configure_linux_pci() {
	local driver_path=""
	driver_name=""
	igb_uio_fallback=""

	if [[ -r "$rootdir/dpdk/build-tmp/kernel/linux/igb_uio/igb_uio.ko" ]]; then
		# igb_uio is a common driver to override with and it depends on uio.
		modprobe uio || true
		if ! check_for_driver igb_uio || insmod "$rootdir/dpdk/build-tmp/kernel/linux/igb_uio/igb_uio.ko"; then
			igb_uio_fallback="$rootdir/dpdk/build-tmp/kernel/linux/igb_uio/igb_uio.ko"
		fi
	fi

	if [[ "${DRIVER_OVERRIDE}" == "none" ]]; then
		driver_name=none
	elif [[ -n "${DRIVER_OVERRIDE}" ]]; then
		driver_path="$DRIVER_OVERRIDE"
		driver_name="${DRIVER_OVERRIDE##*/}"
		# modprobe and the sysfs don't use the .ko suffix.
		driver_name=${driver_name%.ko}
		# path = name -> there is no path
		if [[ "$driver_path" = "$driver_name" ]]; then
			driver_path=""
		fi
	elif is_iommu_enabled; then
		driver_name=vfio-pci
		# Just in case, attempt to load the VFIO_IOMMU_TYPE1 module into the kernel - this
		# should be done automatically by modprobe since this particular module should
		# be a part of vfio-pci dependencies, however, on some distros, it seems that
		# it's not the case. See #1689.
		if modinfo vfio_iommu_type1 > /dev/null; then
			modprobe vfio_iommu_type1
		fi
	elif ! check_for_driver uio_pci_generic || modinfo uio_pci_generic > /dev/null 2>&1; then
		driver_name=uio_pci_generic
	elif [[ -e $igb_uio_fallback ]]; then
		driver_path="$igb_uio_fallback"
		driver_name="igb_uio"
		echo "WARNING: uio_pci_generic not detected - using $driver_name"
	else
		echo "No valid drivers found [vfio-pci, uio_pci_generic, igb_uio]. Please enable one of the kernel modules."
		return 1
	fi

	# modprobe assumes the module lives in the kernel's module directory. If the user passed in a path, use insmod instead.
	if [[ $driver_name != "none" ]]; then
		if [[ -n "$driver_path" ]]; then
			insmod $driver_path || true
		else
			modprobe $driver_name
		fi
	fi

	for bdf in "${!all_devices_d[@]}"; do
		if ((all_devices_d["$bdf"] == 0)); then
			if [[ -n ${nvme_d["$bdf"]} ]]; then
				# Some nvme controllers may take a significant amount of time while being
				# unbound from the driver. Put that task into the background to speed up the
				# whole process. Currently this is done only for the devices bound to the
				# nvme driver as others, i.e., ioatdma's, trigger a kernel BUG when being
				# unbound in parallel. See https://bugzilla.kernel.org/show_bug.cgi?id=209041.
				linux_bind_driver "$bdf" "$driver_name" &
			else
				linux_bind_driver "$bdf" "$driver_name"
			fi
		fi
	done
	wait

	echo "1" > "/sys/bus/pci/rescan"
}

function cleanup_linux() {
	local dirs_to_clean=() files_to_clean=() opened_files=() file_locks=()
	local match_spdk="spdk_tgt|iscsi|vhost|nvmf|rocksdb|bdevio|bdevperf|vhost_fuzz|nvme_fuzz|accel_perf|bdev_svc"

	dirs_to_clean=({/var/run,/tmp}/dpdk/spdk{,_pid}+([0-9]))
	if [[ -d $XDG_RUNTIME_DIR ]]; then
		dirs_to_clean+=("$XDG_RUNTIME_DIR/dpdk/spdk"{,_pid}+([0-9]))
	fi

	for dir in "${dirs_to_clean[@]}"; do
		files_to_clean+=("$dir/"*)
	done
	file_locks+=(/var/tmp/spdk_pci_lock*)
	file_locks+=(/var/tmp/spdk_cpu_lock*)

	files_to_clean+=(/dev/shm/@(@($match_spdk)_trace|spdk_iscsi_conns)*)
	files_to_clean+=("${file_locks[@]}")

	# This may fail in case a path that readlink attempts to resolve suddenly
	# disappears (as may happen with terminating processes).
	opened_files+=($(readlink -f /proc/+([0-9])/fd/+([0-9]))) || true

	if ((${#opened_files[@]} == 0)); then
		echo "Can't get list of opened files!"
		exit 1
	fi

	echo 'Cleaning'
	for f in "${files_to_clean[@]}"; do
		[[ -e $f ]] || continue
		if [[ ${opened_files[*]} != *"$f"* ]]; then
			echo "Removing:    $f"
			rm $f
		else
			echo "Still open: $f"
		fi
	done

	for dir in "${dirs_to_clean[@]}"; do
		[[ -d $dir ]] || continue
		if [[ ${opened_files[*]} != *"$dir"* ]]; then
			echo "Removing:    $dir"
			rmdir $dir
		else
			echo "Still open: $dir"
		fi
	done
	echo "Clean"
}

check_hugepages_alloc() {
	local hp_int=$1
	local allocated_hugepages

	allocated_hugepages=$(< "$hp_int")

	if ((NRHUGE <= allocated_hugepages)) && [[ $SHRINK_HUGE != yes ]]; then
		echo "INFO: Requested $NRHUGE hugepages but $allocated_hugepages already allocated ${2:+on node$2}"
		return 0
	fi

	echo $((NRHUGE < 0 ? 0 : NRHUGE)) > "$hp_int"

	allocated_hugepages=$(< "$hp_int")
	if ((allocated_hugepages < NRHUGE)); then
		cat <<- ERROR

			## ERROR: requested $NRHUGE hugepages but only $allocated_hugepages could be allocated ${2:+on node$2}.
			## Memory might be heavily fragmented. Please try flushing the system cache, or reboot the machine.
		ERROR
		return 1
	fi
}
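
# Usage sketch (illustrative values): allocate 1024 pages on node0 via the per-node
# interface, or system-wide via procfs:
#   NRHUGE=1024 check_hugepages_alloc /sys/devices/system/node/node0/hugepages/hugepages-2048kB/nr_hugepages 0
#   NRHUGE=1024 check_hugepages_alloc /proc/sys/vm/nr_hugepages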

clear_hugepages() { echo 0 > /proc/sys/vm/nr_hugepages; }

configure_linux_hugepages() {
	local node system_nodes
	local nodes_to_use nodes_hp

	if [[ $CLEAR_HUGE == yes ]]; then
		clear_hugepages
	fi

	if [[ $HUGE_EVEN_ALLOC == yes ]]; then
		clear_hugepages
		check_hugepages_alloc /proc/sys/vm/nr_hugepages
		return 0
	fi

	for node in /sys/devices/system/node/node*; do
		[[ -e $node ]] || continue
		nodes[${node##*node}]=$node/hugepages/hugepages-${HUGEPGSZ}kB/nr_hugepages
	done

	if ((${#nodes[@]} == 0)); then
		# No NUMA support? Fall back to the common interface
		check_hugepages_alloc /proc/sys/vm/nr_hugepages
		return 0
	fi

	IFS="," read -ra nodes_to_use <<< "$HUGENODE"
	if ((${#nodes_to_use[@]} == 0)); then
		nodes_to_use[0]=0
	fi

	# Align indexes with node ids
	for node in "${!nodes_to_use[@]}"; do
		if [[ ${nodes_to_use[node]} =~ ^nodes_hp\[[0-9]+\]= ]]; then
			eval "${nodes_to_use[node]}"
		elif [[ ${nodes_to_use[node]} =~ ^[0-9]+$ ]]; then
			nodes_hp[nodes_to_use[node]]=$NRHUGE
		fi
	done

	for node in "${!nodes_hp[@]}"; do
		if [[ -z ${nodes[node]} ]]; then
			echo "Node $node doesn't exist, ignoring" >&2
			continue
		fi
		NRHUGE=${nodes_hp[node]:-$NRHUGE} check_hugepages_alloc "${nodes[node]}" "$node"
	done
}
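
# Examples of the HUGENODE syntax handled above (illustrative values):
#   HUGENODE=0,1                  - allocate NRHUGE pages on node0 and node1
#   HUGENODE='nodes_hp[0]=2048,2' - 2048 pages on node0, NRHUGE pages on node2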

function configure_linux() {
	configure_linux_pci
	hugetlbfs_mounts=$(linux_hugetlbfs_mounts)

	if [ -z "$hugetlbfs_mounts" ]; then
		hugetlbfs_mounts=/mnt/huge
		echo "Mounting hugetlbfs at $hugetlbfs_mounts"
		mkdir -p "$hugetlbfs_mounts"
		mount -t hugetlbfs nodev "$hugetlbfs_mounts"
	fi

	configure_linux_hugepages

	if [ "$driver_name" = "vfio-pci" ]; then
		if [ -n "$TARGET_USER" ]; then
			for mount in $hugetlbfs_mounts; do
				chown "$TARGET_USER" "$mount"
				chmod g+w "$mount"
			done

			MEMLOCK_AMNT=$(su "$TARGET_USER" -c "ulimit -l")
			if [[ $MEMLOCK_AMNT != "unlimited" ]]; then
				MEMLOCK_MB=$((MEMLOCK_AMNT / 1024))
				cat <<- MEMLOCK
					"$TARGET_USER" user memlock limit: $MEMLOCK_MB MB

					This is the maximum amount of memory you will be
					able to use with DPDK and VFIO if run as user "$TARGET_USER".
					To change this, please adjust limits.conf memlock limit for user "$TARGET_USER".
				MEMLOCK
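				# For example (assuming pam_limits is in use; the user name is illustrative),
				# lines like these in /etc/security/limits.conf raise the limit:
				#   spdkuser    soft    memlock    unlimited
				#   spdkuser    hard    memlock    unlimited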
				if ((MEMLOCK_AMNT < 65536)); then
					echo ""
					echo "## WARNING: memlock limit is less than 64MB"
					echo -n "## DPDK with VFIO may not be able to initialize "
					echo "if run as user \"$TARGET_USER\"."
				fi
			fi
		fi
	fi

	if [ $(uname -i) == "x86_64" ] && [ ! -e /dev/cpu/0/msr ]; then
		# Some distros build msr as a module.  Make sure it's loaded to ensure
		#  DPDK can easily figure out the TSC rate rather than relying on 100ms
		#  sleeps.
		modprobe msr &> /dev/null || true
	fi
}

function reset_linux_pci() {
	# virtio
	# TODO: check if virtio-pci is loaded first and just unbind if it is not loaded
	# Requires some more investigation - for example, some kernels do not seem to have
	#  virtio-pci but just virtio_scsi instead.  Also need to make sure we get the
	#  underscore vs. dash right in the virtio_scsi name.
	modprobe virtio-pci || true
	for bdf in "${!all_devices_d[@]}"; do
		((all_devices_d["$bdf"] == 0)) || continue

		driver=$(collect_driver "$bdf")
		if [[ -n $driver ]] && ! check_for_driver "$driver"; then
			linux_bind_driver "$bdf" "$driver"
		else
			linux_unbind_driver "$bdf"
		fi
	done

	echo "1" > "/sys/bus/pci/rescan"
}

function reset_linux() {
	reset_linux_pci
	for mount in $(linux_hugetlbfs_mounts); do
		for hp in "$mount"/spdk*map_*; do
			flock -n "$hp" true && rm -f "$hp"
		done
	done
	rm -f /run/.spdk*
}

function status_linux() {
	echo "Hugepages" >&2
	printf "%-6s %10s %8s / %6s\n" "node" "hugesize" "free" "total" >&2

	numa_nodes=0
	for path in /sys/devices/system/node/node*/hugepages/hugepages-*/; do
		numa_nodes=$((numa_nodes + 1))
		free_pages=$(cat $path/free_hugepages)
		all_pages=$(cat $path/nr_hugepages)

		[[ $path =~ (node[0-9]+)/hugepages/hugepages-([0-9]+kB) ]]

		node=${BASH_REMATCH[1]}
		huge_size=${BASH_REMATCH[2]}

		printf "%-6s %10s %8s / %6s\n" $node $huge_size $free_pages $all_pages
	done

	# fall back to system-wide hugepages
	if [ "$numa_nodes" = "0" ]; then
		free_pages=$(grep HugePages_Free /proc/meminfo | awk '{ print $2 }')
		all_pages=$(grep HugePages_Total /proc/meminfo | awk '{ print $2 }')
		node="-"
		huge_size="$HUGEPGSZ"

		printf "%-6s %10s %8s / %6s\n" $node $huge_size $free_pages $all_pages
	fi

	printf '\n%-25s %-15s %-6s %-6s %-7s %-16s %-10s %s\n' \
		"Type" "BDF" "Vendor" "Device" "NUMA" "Driver" "Device" "Block devices" >&2

	sorted_bdfs=($(printf '%s\n' "${!all_devices_d[@]}" | sort))

	for bdf in "${sorted_bdfs[@]}"; do
		driver=${drivers_d["$bdf"]}
		if [ "$numa_nodes" = "0" ]; then
			node="-"
		else
			node=$(cat /sys/bus/pci/devices/$bdf/numa_node)
			if ((node == -1)); then
				node=unknown
			fi
		fi
		if [ "$driver" = "nvme" ] && [ -d /sys/bus/pci/devices/$bdf/nvme ]; then
			name=$(ls /sys/bus/pci/devices/$bdf/nvme)
		else
			name="-"
		fi

		if [[ -n ${nvme_d["$bdf"]} || -n ${virtio_d["$bdf"]} ]]; then
			blknames=($(get_block_dev_from_bdf "$bdf"))
		else
			blknames=("-")
		fi

		desc=""
		desc=${desc:-${nvme_d["$bdf"]:+NVMe${nvme_vmd_d["$bdf"]:+@${nvme_vmd_d["$bdf"]}(VMD)}}}
		desc=${desc:-${ioat_d["$bdf"]:+I/OAT}}
		desc=${desc:-${dsa_d["$bdf"]:+DSA}}
		desc=${desc:-${iaa_d["$bdf"]:+IAA}}
		desc=${desc:-${virtio_d["$bdf"]:+virtio}}
		desc=${desc:-${vmd_d["$bdf"]:+VMD}}

		printf '%-25s %-15s %-6s %-6s %-7s %-16s %-10s %s\n' \
			"$desc" "$bdf" "${pci_ids_vendor["$bdf"]#0x}" "${pci_ids_device["$bdf"]#0x}" \
			"$node" "${driver:--}" "${name:-}" "${blknames[*]:--}"
	done
}

function status_freebsd() {
	local pci

	status_print() (
		local type=$1
		local dev driver

		shift

		for pci; do
			printf '%-8s %-15s %-6s %-6s %-16s\n' \
				"$type" \
				"$pci" \
				"${pci_ids_vendor["$pci"]}" \
				"${pci_ids_device["$pci"]}" \
				"${pci_bus_driver["$pci"]}"
		done | sort -k2,2
	)

	local contigmem=present
	local contigmem_buffer_size
	local contigmem_num_buffers

	if ! kldstat -q -m contigmem; then
		contigmem="not present"
	fi
	if ! contigmem_buffer_size=$(kenv hw.contigmem.buffer_size 2> /dev/null); then
		contigmem_buffer_size="not set"
	fi
	if ! contigmem_num_buffers=$(kenv hw.contigmem.num_buffers 2> /dev/null); then
		contigmem_num_buffers="not set"
	fi

	cat <<- BSD_INFO
		Contigmem ($contigmem)
		Buffer Size: $contigmem_buffer_size
		Num Buffers: $contigmem_num_buffers

	BSD_INFO

	printf '\n%-8s %-15s %-6s %-6s %-16s\n' \
		"Type" "BDF" "Vendor" "Device" "Driver" >&2

	status_print "NVMe" "${!nvme_d[@]}"
	status_print "I/OAT" "${!ioat_d[@]}"
	status_print "DSA" "${!dsa_d[@]}"
	status_print "IAA" "${!iaa_d[@]}"
	status_print "VMD" "${!vmd_d[@]}"
}

function configure_freebsd_pci() {
	local BDFS

	BDFS+=("${!nvme_d[@]}")
	BDFS+=("${!ioat_d[@]}")
	BDFS+=("${!dsa_d[@]}")
	BDFS+=("${!iaa_d[@]}")
	BDFS+=("${!vmd_d[@]}")

	# Drop the domain part from all the addresses
	BDFS=("${BDFS[@]#*:}")

	local IFS=","
	kldunload nic_uio.ko || true
	kenv hw.nic_uio.bdfs="${BDFS[*]}"
	kldload nic_uio.ko
}
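
# For reference, the manual equivalent of the above looks roughly like this
# (addresses are illustrative):
#   kenv hw.nic_uio.bdfs="01:00.0,02:00.0"
#   kldload nic_uio.ko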

function configure_freebsd() {
	if ! check_for_driver_freebsd; then
		echo "DPDK drivers (contigmem and/or nic_uio) are missing, aborting" >&2
		return 1
	fi
	configure_freebsd_pci
	# If contigmem is already loaded but the HUGEMEM specified doesn't match the
	#  previous value, unload contigmem so that we can reload with the new value.
	if kldstat -q -m contigmem; then
		# contigmem may be loaded, but the kernel environment doesn't necessarily
		# have to be set at this point. If it isn't, kenv will fail to
		# pick up the hw. options. Handle it.
		if ! contigmem_num_buffers=$(kenv hw.contigmem.num_buffers); then
			contigmem_num_buffers=-1
		fi 2> /dev/null
		if ((contigmem_num_buffers != HUGEMEM / 256)); then
			kldunload contigmem.ko
		fi
	fi
	if ! kldstat -q -m contigmem; then
		kenv hw.contigmem.num_buffers=$((HUGEMEM / 256))
		kenv hw.contigmem.buffer_size=$((256 * 1024 * 1024))
		kldload contigmem.ko
	fi
}

function reset_freebsd() {
	kldunload contigmem.ko || true
	kldunload nic_uio.ko || true
}

function set_hp() {
	if [[ -n $HUGEPGSZ && ! -e /sys/kernel/mm/hugepages/hugepages-${HUGEPGSZ}kB ]]; then
		echo "${HUGEPGSZ}kB is not supported by the running kernel, ignoring" >&2
		unset -v HUGEPGSZ
	fi

	HUGEPGSZ=${HUGEPGSZ:-$(grep Hugepagesize /proc/meminfo | cut -d : -f 2 | tr -dc '0-9')}
	HUGEPGSZ_MB=$((HUGEPGSZ / 1024))
	NRHUGE=${NRHUGE:-$(((HUGEMEM + HUGEPGSZ_MB - 1) / HUGEPGSZ_MB))}
}
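
# E.g. with the default HUGEMEM=2048 and 2048 kB hugepages this yields HUGEPGSZ_MB=2
# and NRHUGE=1024; with 1 GB hugepages (HUGEPGSZ=1048576) it would yield NRHUGE=2.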

CMD=reset cache_pci_bus

mode=$1

if [ -z "$mode" ]; then
	mode="config"
fi

: ${HUGEMEM:=2048}
: ${PCI_ALLOWED:=""}
: ${PCI_BLOCKED:=""}

if [ -n "$NVME_ALLOWED" ]; then
	PCI_ALLOWED="$PCI_ALLOWED $NVME_ALLOWED"
fi

if [ -n "$SKIP_PCI" ]; then
	PCI_ALLOWED="none"
fi

if [ -z "$TARGET_USER" ]; then
	TARGET_USER="$SUDO_USER"
	if [ -z "$TARGET_USER" ]; then
		TARGET_USER=$(logname 2> /dev/null) || true
	fi
fi

collect_devices "$mode"

if [[ $os == Linux ]]; then
	set_hp
fi

if [[ $mode == interactive ]]; then
	source "$rootdir/scripts/common/setup/interactive.sh"
	main_menu "$2" || exit 0
fi

if [[ $mode == reset && $PCI_BLOCK_SYNC_ON_RESET == yes ]]; then
	# Note that this will wait only for the first block device attached to
	# a given storage controller. For nvme this may miss some of the devs
	# in case multiple namespaces are in place.
	# FIXME: Wait for nvme controller(s) to be in live state and determine
	# number of configured namespaces, build list of potential block devs
	# and pass them to sync_dev_uevents. Is it worth the effort?
	bdfs_to_wait_for=()
	for bdf in "${!all_devices_d[@]}"; do
		((all_devices_d["$bdf"] == 0)) || continue
		if [[ -n ${nvme_d["$bdf"]} || -n ${virtio_d["$bdf"]} ]]; then
			[[ $(collect_driver "$bdf") != "${drivers_d["$bdf"]}" ]] || continue
			bdfs_to_wait_for+=("$bdf")
		fi
	done
	if ((${#bdfs_to_wait_for[@]} > 0)); then
		echo "Waiting for block devices as requested"
		export UEVENT_TIMEOUT=5 DEVPATH_LOOKUP=yes DEVPATH_SUBSYSTEM=pci
		"$rootdir/scripts/sync_dev_uevents.sh" \
			block/disk \
			"${bdfs_to_wait_for[@]}" &
		sync_pid=$!
	fi
fi

if [[ $os == Linux ]]; then
	if [ "$mode" == "config" ]; then
		configure_linux
	elif [ "$mode" == "cleanup" ]; then
		cleanup_linux
		clear_hugepages
	elif [ "$mode" == "reset" ]; then
		reset_linux
	elif [ "$mode" == "status" ]; then
		status_linux
	elif [ "$mode" == "help" ]; then
		usage $0
	else
		usage $0 "Invalid argument '$mode'"
	fi
else
	if [ "$mode" == "config" ]; then
		configure_freebsd
	elif [ "$mode" == "reset" ]; then
		reset_freebsd
	elif [ "$mode" == "cleanup" ]; then
		echo "setup.sh cleanup function not yet supported on $os"
	elif [ "$mode" == "status" ]; then
		status_freebsd
	elif [ "$mode" == "help" ]; then
		usage $0
	else
		usage $0 "Invalid argument '$mode'"
	fi
fi

if [[ -e /proc/$sync_pid/status ]]; then
	wait "$sync_pid"
fi