xref: /spdk/scripts/setup.sh (revision b6875e1ce57743f3b1416016b9c624d79a862af9)
#!/usr/bin/env bash
#  SPDX-License-Identifier: BSD-3-Clause
#  Copyright (C) 2016 Intel Corporation
#  All rights reserved.
#
# Abort on first error; nullglob makes unmatched globs expand to nothing
# and extglob enables the +()/!()/@() patterns used further below.
set -e
shopt -s nullglob extglob

os=$(uname -s)

# Only Linux and FreeBSD are supported.
if [[ $os != Linux && $os != FreeBSD ]]; then
	echo "Not supported platform ($os), aborting"
	exit 1
fi

# Repository root, resolved relative to this script's location.
rootdir=$(readlink -f $(dirname $0))/..
# Shared helpers - pci_* maps, kmsg, cache_pci_bus etc. presumably come
# from here (not defined in this file).
source "$rootdir/scripts/common.sh"
18
# Print the help text and exit.
# Arguments: $1 - invoking script path (shown in the Usage line),
#            $2 - optional error message printed before the help text.
# Always exits with status 0, even for the "Invalid argument" path.
function usage() {
	if [[ $os == Linux ]]; then
		options="[config|reset|status|cleanup|interactive|help]"
	else
		options="[config|reset|interactive|help]"
	fi

	# Print the optional error message (in a subshell) first.
	[[ -n $2 ]] && (
		echo "$2"
		echo ""
	)
	echo "Helper script for allocating hugepages and binding NVMe, I/OAT, VMD and Virtio devices"
	echo "to a generic VFIO kernel driver. If VFIO is not available on the system, this script"
	echo "will fall back to UIO. NVMe and Virtio devices with active mountpoints will be ignored."
	echo "All hugepage operations use default hugepage size on the system (hugepagesz)."
	echo "Usage: $(basename $1) $options"
	echo
	echo "$options - as following:"
	echo "config            Default mode. Allocate hugepages and bind PCI devices."
	if [[ $os == Linux ]]; then
		echo "cleanup           Remove any orphaned files that can be left in the system after SPDK application exit"
	fi
	echo "reset             Rebind PCI devices back to their original drivers."
	echo "                  Also cleanup any leftover spdk files/resources."
	echo "                  Hugepage memory size will remain unchanged."
	if [[ $os == Linux ]]; then
		echo "status            Print status of all SPDK-compatible devices on the system."
	fi
	echo "interactive       Executes script in interactive mode."
	echo "help              Print this help message."
	echo
	echo "The following environment variables can be specified."
	echo "HUGEMEM           Size of hugepage memory to allocate (in MB). 2048 by default."
	echo "                  For NUMA systems, the hugepages will be distributed on node0 by"
	echo "                  default."
	echo "HUGE_EVEN_ALLOC   If set to 'yes', hugepages will be evenly distributed across all"
	echo "                  system's NUMA nodes (effectively ignoring anything set in HUGENODE)."
	echo "                  Uses kernel's default for hugepages size."
	echo "NRHUGE            Number of hugepages to allocate. This variable overwrites HUGEMEM."
	echo "HUGENODE          Specific NUMA node to allocate hugepages on. Multiple nodes can be"
	echo "                  separated with comas. By default, NRHUGE will be applied on each node."
	echo "                  Hugepages can be defined per node with e.g.:"
	echo "                  HUGENODE='nodes_hp[0]=2048,nodes_hp[1]=512,2' - this will allocate"
	echo "                  2048 pages for node0, 512 for node1 and default NRHUGE for node2."
	echo "HUGEPGSZ          Size of the hugepages to use in kB. If not set, kernel's default"
	echo "                  setting is used."
	echo "SHRINK_HUGE       If set to 'yes', hugepages allocation won't be skipped in case"
	echo "                  number of requested hugepages is lower from what's already"
	echo "                  allocated. Doesn't apply when HUGE_EVEN_ALLOC is in use."
	echo "CLEAR_HUGE        If set to 'yes', the attempt to remove hugepages from all nodes will"
	echo "                  be made prior to allocation".
	echo "PCI_ALLOWED"
	echo "PCI_BLOCKED       Whitespace separated list of PCI devices (NVMe, I/OAT, VMD, Virtio)."
	echo "                  Each device must be specified as a full PCI address."
	echo "                  E.g. PCI_ALLOWED=\"0000:01:00.0 0000:02:00.0\""
	echo "                  To block all PCI devices: PCI_ALLOWED=\"none\""
	echo "                  To allow all PCI devices except 0000:01:00.0: PCI_BLOCKED=\"0000:01:00.0\""
	echo "                  To allow only PCI device 0000:01:00.0: PCI_ALLOWED=\"0000:01:00.0\""
	echo "                  If PCI_ALLOWED and PCI_BLOCKED are empty or unset, all PCI devices"
	echo "                  will be bound."
	echo "                  Each device in PCI_BLOCKED will be ignored (driver won't be changed)."
	echo "                  PCI_BLOCKED has precedence over PCI_ALLOWED."
	echo "TARGET_USER       User that will own hugepage mountpoint directory and vfio groups."
	echo "                  By default the current user will be used."
	echo "DRIVER_OVERRIDE   Disable automatic vfio-pci/uio_pci_generic selection and forcefully"
	echo "                  bind devices to the given driver."
	echo "                  E.g. DRIVER_OVERRIDE=uio_pci_generic or DRIVER_OVERRIDE=/home/public/dpdk/build/kmod/igb_uio.ko"
	echo "PCI_BLOCK_SYNC_ON_RESET"
	echo "                  If set in the environment, the attempt to wait for block devices associated"
	echo "                  with given PCI device will be made upon reset"
	echo "UNBIND_ENTIRE_IOMMU_GROUP"
	echo "                  If set, all devices from nvme's iommu group will be unbound from their drivers."
	echo "                  Use with caution."
	echo "DEV_TYPE"
	echo "                  Perform action only against selected type of devices. Supported:"
	echo "                    IOAT|DSA|IAA|VIRTIO|VMD|NVME."
	echo "                  Default is to select all types."
	echo "FORCE_NIC_UIO_REBIND"
	echo "                  When set to 'yes', an attempt to reload nic_uio will be made regardless"
	echo "                  of the kernel environment. Applicable only under FreeBSD."
	exit 0
}
101
# Detect whether a kernel driver is available. Returns 0 when not found,
# 1 when listed by lsmod, 2 when visible through sysfs. The sysfs checks
# cover monolithic kernels and distros (e.g. Fedora-like) where neither
# lsmod nor /sys/module may contain the needed info, so we also look at
# /sys/bus/pci/drivers/.
function check_for_driver() {
	local driver=$1 alt dir

	[[ -n $driver ]] || return 0
	alt=${driver//-/_}

	# Loaded as a module? (substring match, same as upstream behavior)
	if lsmod | grep -q "$alt"; then
		return 1
	fi

	# Built-in, or registered on the PCI bus?
	for dir in "/sys/module/$driver" "/sys/module/$alt" \
		"/sys/bus/pci/drivers/$driver" "/sys/bus/pci/drivers/$alt"; do
		[[ -d $dir ]] && return 2
	done

	return 0
}
123
# Verify the DPDK kernel modules (contigmem.ko, nic_uio.ko) exist in the
# kernel's module search path (kldconfig -rU, ';'-separated).
# Returns 1 as soon as one of them cannot be located.
function check_for_driver_freebsd() {
	local paths=() module path found
	IFS=";" read -ra paths < <(kldconfig -rU)

	for module in contigmem.ko nic_uio.ko; do
		found=no
		for path in "${paths[@]}"; do
			if [[ -f $path/$module ]]; then
				found=yes
				break
			fi
		done
		[[ $found == yes ]] || return 1
	done
	return 0
}
137
# Print a message prefixed with the device's BDF and its vendor/device
# IDs (leading "0x" stripped), taken from the global pci_ids_* maps.
function pci_dev_echo() {
	local bdf=$1 vendor device
	shift
	vendor=${pci_ids_vendor["$bdf"]#0x}
	device=${pci_ids_device["$bdf"]#0x}
	echo "$bdf ($vendor $device): $*"
}
143
# Bind a PCI device to the given driver via sysfs.
# Arguments: $1 - device BDF, $2 - target driver name ("none" = just
# unbind). The currently bound driver is looked up in pci_bus_driver[].
# Returns 1 when the device did not end up bound to the target driver.
function probe_driver() {
	local bdf=$1
	local driver_name=$2
	# NOTE(review): old_driver_name is not declared local, so it leaks
	# into the caller's scope - confirm before tightening.
	old_driver_name=${pci_bus_driver["$bdf"]:-no driver}

	if [[ $driver_name == "$old_driver_name" ]]; then
		pci_dev_echo "$bdf" "Already using the $old_driver_name driver"
		return 0
	fi

	# Release the device from its current driver first.
	if [[ $old_driver_name != "no driver" ]]; then
		echo "$bdf" > "/sys/bus/pci/devices/$bdf/driver/unbind"
	fi

	pci_dev_echo "$bdf" "$old_driver_name -> $driver_name"

	# "none" means leave the device unbound.
	if [[ $driver_name == "none" ]]; then
		return 0
	fi

	# Pin the device to this driver via driver_override, then trigger a
	# probe. The probe may transiently fail, so retry up to 10 times.
	local probe_attempts=0
	echo "$driver_name" > "/sys/bus/pci/devices/$bdf/driver_override"
	while ! echo "$bdf" > "/sys/bus/pci/drivers_probe" && ((probe_attempts++ < 10)); do
		pci_dev_echo "$bdf" "failed to bind to $driver_name, retrying ($probe_attempts)"
		sleep 0.5
	done 2> /dev/null

	# Clear the override so future probes aren't pinned to this driver.
	echo "" > "/sys/bus/pci/devices/$bdf/driver_override"

	if [[ ! -e /sys/bus/pci/drivers/$driver_name/$bdf ]]; then
		pci_dev_echo "$bdf" "failed to bind to $driver_name, aborting"
		return 1
	fi
}
178
# Bind a device ($1) to driver ($2) and, for vfio drivers, hand the
# /dev/vfio group node over to TARGET_USER. When other devices share the
# device's IOMMU group but are bound to a different driver, print
# warnings (vfio requires the whole group to use the same driver or be
# unbound); with UNBIND_ENTIRE_IOMMU_GROUP=yes they are unbound here.
function linux_bind_driver() {
	local bdf="$1"
	local driver_name="$2"

	probe_driver "$bdf" "$driver_name"

	# Make the vfio group char device accessible to the target user.
	local iommu_group=${pci_iommu_groups["$bdf"]}
	if [ -e "/dev/vfio/$iommu_group" ]; then
		if [ -n "$TARGET_USER" ]; then
			chown "$TARGET_USER" "/dev/vfio/$iommu_group"
		fi
	fi

	# Indirect expansion of the iommu_groups[] entry: all members of
	# this device's IOMMU group.
	local iommug=("${!iommu_groups[iommu_group]}")
	local _bdf _driver
	if ((${#iommug[@]} > 1)) && [[ $driver_name == vfio* ]]; then
		pci_dev_echo "$bdf" "WARNING: detected multiple devices (${#iommug[@]}) under the same IOMMU group!"
		for _bdf in "${iommug[@]}"; do
			[[ $_bdf == "$bdf" ]] && continue
			_driver=$(readlink -f "/sys/bus/pci/devices/$_bdf/driver") && _driver=${_driver##*/}
			if [[ $_driver == "$driver_name" ]]; then
				continue
			fi
			# See what DPDK considers to be a "viable" iommu group: dpdk/lib/eal/linux/eal_vfio.c -> rte_vfio_setup_device()
			pci_dev_echo "$bdf" "WARNING: ${_bdf##*/} not bound to $driver_name (${_driver:-no driver})"
			pci_dev_echo "$bdf" "WARNING All devices in the IOMMU group must be bound to the same driver or unbound"
			if [[ $UNBIND_ENTIRE_IOMMU_GROUP == yes ]]; then
				pci_dev_echo "$bdf" "WARNING: Attempting to unbind ${_bdf##*/}"
				# Record the current driver so probe_driver knows what to unbind from.
				pci_bus_driver["${_bdf##*/}"]=$_driver
				probe_driver "${_bdf##*/}" none
			fi
		done
	fi

}
214
# Unbind device ($1) from its current driver (looked up in
# pci_bus_driver[]) and clear any driver_override left behind.
function linux_unbind_driver() {
	local bdf="$1"
	local old_driver_name=${pci_bus_driver["$bdf"]:-no driver}

	if [[ $old_driver_name == "no driver" ]]; then
		pci_dev_echo "$bdf" "Not bound to any driver"
		return 0
	fi

	if [[ -e /sys/bus/pci/drivers/$old_driver_name ]]; then
		echo "$bdf" > "/sys/bus/pci/drivers/$old_driver_name/unbind"
		echo "" > "/sys/bus/pci/devices/$bdf/driver_override"
	fi

	pci_dev_echo "$bdf" "$old_driver_name -> no driver"
}
231
# List the mountpoints of all currently mounted hugetlbfs filesystems,
# one per line.
function linux_hugetlbfs_mounts() {
	mount | awk '/ type hugetlbfs / { print $3 }'
}
235
# Print the names of all block devices that sit behind the PCI device
# ($1). Non-NVMe devices are matched through their sysfs "device"
# symlink; NVMe namespaces are resolved via get_block_dev_from_nvme().
function get_block_dev_from_bdf() {
	local bdf=$1
	local entry matched=()

	# extglob's !(nvme*) excludes nvme entries - handled separately below.
	for entry in /sys/block/!(nvme*); do
		[[ $(readlink -f "$entry/device") == *"/$bdf/"* ]] && matched+=("${entry##*/}")
	done

	matched+=($(get_block_dev_from_nvme "$bdf"))

	printf '%s\n' "${matched[@]}"
}
250
# Print block device names (e.g. nvme0n1) belonging to the NVMe
# controller at PCI address $1. Matching goes through the controller's
# subsystem NQN so all namespaces of the subsystem are reported; hidden
# block devices are skipped.
function get_block_dev_from_nvme() {
	local bdf=$1 block ctrl sub

	# Locate the controller with a matching PCI address, grab its NQN.
	for ctrl in /sys/class/nvme/nvme*; do
		[[ -e $ctrl/address && $(< "$ctrl/address") == "$bdf" ]] || continue
		sub=$(< "$ctrl/subsysnqn") && break
	done

	# No controller at this address - nothing to report.
	[[ -n $sub ]] || return 0

	for block in /sys/block/nvme*; do
		[[ -e $block/hidden && $(< "$block/hidden") == 1 ]] && continue
		[[ $(< "$block/device/subsysnqn") == "$sub" ]] && echo "${block##*/}"
	done
}
266
# Print identifiers of block devices behind PCI device $1 that are in
# use: held by another device (holder@...), mounted (mount@...), or
# carrying data recognized by block_in_use (data@...; block_in_use comes
# from the sourced common.sh - see there for what it detects).
# Returns 1 when lsblk is not available.
function get_used_bdf_block_devs() {
	local bdf=$1
	local blocks block blockp dev mount holder
	local used

	hash lsblk &> /dev/null || return 1
	blocks=($(get_block_dev_from_bdf "$bdf"))

	for block in "${blocks[@]}"; do
		# Check if the device is hold by some other, regardless if it's mounted
		# or not.
		for holder in "/sys/class/block/$block"*/holders/*; do
			[[ -e $holder ]] || continue
			blockp=${holder%/holders*} blockp=${blockp##*/}
			if [[ -e $holder/slaves/$blockp ]]; then
				used+=("holder@$blockp:${holder##*/}")
			fi
		done
		# Any mounted partition of this device counts as "used" too.
		while read -r dev mount; do
			if [[ -e $mount ]]; then
				used+=("mount@$block:$dev")
			fi
		done < <(lsblk -l -n -o NAME,MOUNTPOINT "/dev/$block")
		if ((${#used[@]} == 0)); then
			# Make sure we check if there's any valid data present on the target device
			# regardless if it's being actively used or not. This is mainly done to make
			# sure we don't miss more complex setups like ZFS pools, etc.
			if block_in_use "$block" > /dev/null; then
				used+=("data@$block")
			fi
		fi
	done

	if ((${#used[@]} > 0)); then
		printf '%s\n' "${used[@]}"
	fi
}
304
# Walk all supported devices and mark each as usable (0) or in-use /
# denied (1) both in its per-type map (nvme_d, vmd_d, ...) and in
# all_devices_d[]. Also warns about VMD controllers that still have
# kernel-attached NVMe SSDs, and skips NVMe devices sharing an IOMMU
# group with a VMD controller (unless ALLOW_NVME_BEHIND_VMD is set or
# DRIVER_OVERRIDE=none).
# Arguments: $1 - mode of operation; "status" skips the allow/deny check.
function collect_devices() {
	local mode=$1 in_use

	map_supported_devices "$DEV_TYPE"

	for bdf in "${!all_devices_d[@]}"; do
		in_use=0
		if [[ $mode != status ]]; then
			if ! pci_can_use "$bdf"; then
				pci_dev_echo "$bdf" "Skipping denied controller at $bdf"
				in_use=1
			fi
		fi
		if [[ -n ${nvme_d["$bdf"]} || -n ${virtio_d["$bdf"]} ]]; then
			if ! verify_bdf_block_devs "$bdf"; then
				in_use=1
			fi
		fi
		if [[ -n ${vmd_d["$bdf"]} ]]; then
			if [[ $PCI_ALLOWED != *"$bdf"* ]]; then
				pci_dev_echo "$bdf" "Skipping not allowed VMD controller at $bdf"
				in_use=1
			elif ((vmd_nvme_count["$bdf"] > 0)) && [[ $DRIVER_OVERRIDE != none && $mode == config ]]; then
				# FIX: this previously tested $DRIVER_OVERRLDE (typo), which is
				# always empty, so the message below was printed even when the
				# user had already set DRIVER_OVERRIDE=none as it suggests.
				cat <<- MESSAGE
					Binding new driver to VMD device with NVMe SSDs attached to the kernel:
					  ${!vmd_nvme_d["$bdf"]}
					The binding process may go faster if you first run this script with
					DRIVER_OVERRIDE="none" to unbind only the NVMe SSDs, and then run
					again to unbind the VMD devices.
				MESSAGE
			fi
		fi
		if [[ -n ${dsa_d["$bdf"]} ]] && [[ $PCI_ALLOWED != *"$bdf"* ]]; then
			pci_dev_echo "$bdf" "Skipping not allowed DSA controller at $bdf"
			in_use=1
		fi
		if [[ -n ${iaa_d["$bdf"]} ]] && [[ $PCI_ALLOWED != *"$bdf"* ]]; then
			pci_dev_echo "$bdf" "Skipping not allowed IAA controller at $bdf"
			in_use=1
		fi
		# Update in-use for each bdf. Default from the map_supported_devices() is 0 == "not used"
		local -n type_ref=${all_devices_type_d["$bdf"]}_d
		type_ref["$bdf"]=$in_use
		all_devices_d["$bdf"]=$in_use
	done

	# Check if we got any nvmes attached to VMDs sharing the same iommu_group - if there are
	# any skip them since they won't be usable by SPDK without moving the entire VMD ctrl
	# away from the kernel first. That said, allow to touch the nvmes in case user requested
	# all devices to be unbound from any driver or if dedicated override flag was set.
	[[ -z $ALLOW_NVME_BEHIND_VMD && $DRIVER_OVERRIDE != none ]] || return 0

	for bdf in "${!nvme_d[@]}"; do
		is_nvme_iommu_shared_with_vmd "$bdf" || continue
		nvme_d["$bdf"]=1 all_devices_d["$bdf"]=1
		pci_dev_echo "$bdf" "Skipping nvme behind VMD (${nvme_vmd_d["$bdf"]})"
	done

	get_unsupported_nic_uio_hw

	return 0
}
367
# Print the name of the kernel driver that natively handles device $1.
# Resolution is attempted through the device's modalias (modprobe -R);
# if that fails, fall back to a fixed mapping based on which device
# class map (nvme_d, ioat_d, ...) the BDF belongs to.
function collect_driver() {
	local bdf=$1
	local drivers driver

	if [[ -e /sys/bus/pci/devices/$bdf/modalias ]] \
		&& drivers=($(modprobe -R "$(< "/sys/bus/pci/devices/$bdf/modalias")")); then
		# Pick first entry in case multiple aliases are bound to a driver.
		driver=$(readlink -f "/sys/module/${drivers[0]}/drivers/pci:"*)
		driver=${driver##*/}
	else
		[[ -n ${nvme_d["$bdf"]} ]] && driver=nvme
		[[ -n ${ioat_d["$bdf"]} ]] && driver=ioatdma
		[[ -n ${dsa_d["$bdf"]} ]] && driver=idxd
		[[ -n ${iaa_d["$bdf"]} ]] && driver=idxd
		[[ -n ${virtio_d["$bdf"]} ]] && driver=virtio-pci
		[[ -n ${vmd_d["$bdf"]} ]] && driver=vmd
	fi 2> /dev/null
	# May print an empty line when nothing matched.
	echo "$driver"
}
387
# Check whether PCI device $1 has block devices that are actively used.
# Returns 1 (after printing the offenders) when it does, or when the
# usage check itself fails; returns 0 when the device is safe to rebind.
function verify_bdf_block_devs() {
	local bdf=$1
	local active

	active=($(get_used_bdf_block_devs "$bdf")) || return 1

	((${#active[@]} == 0)) && return 0

	local IFS=","
	pci_dev_echo "$bdf" "Active devices: ${active[*]}, so not binding PCI dev"
	return 1
}
399
# Select a userspace driver, load it and bind all usable devices to it.
# Selection order: DRIVER_OVERRIDE ("none" = unbind only, a path = insmod
# that module), else vfio-pci when the IOMMU is enabled, else
# uio_pci_generic, else the in-tree igb_uio build.
# NOTE: driver_name and igb_uio_fallback are deliberately not local -
# driver_name is read later by configure_linux().
function configure_linux_pci() {
	local driver_path=""
	driver_name=""
	igb_uio_fallback=""

	if [[ -r "$rootdir/dpdk/build-tmp/kernel/linux/igb_uio/igb_uio.ko" ]]; then
		# igb_uio is a common driver to override with and it depends on uio.
		modprobe uio || true
		if ! check_for_driver igb_uio || insmod "$rootdir/dpdk/build-tmp/kernel/linux/igb_uio/igb_uio.ko"; then
			igb_uio_fallback="$rootdir/dpdk/build-tmp/kernel/linux/igb_uio/igb_uio.ko"
		fi
	fi

	if [[ "${DRIVER_OVERRIDE}" == "none" ]]; then
		driver_name=none
	elif [[ -n "${DRIVER_OVERRIDE}" ]]; then
		driver_path="$DRIVER_OVERRIDE"
		driver_name="${DRIVER_OVERRIDE##*/}"
		# modprobe and the sysfs don't use the .ko suffix.
		driver_name=${driver_name%.ko}
		# path = name -> there is no path
		if [[ "$driver_path" = "$driver_name" ]]; then
			driver_path=""
		fi
	elif is_iommu_enabled; then
		driver_name=vfio-pci
		# Just in case, attempt to load VFIO_IOMMU_TYPE1 module into the kernel - this
		# should be done automatically by modprobe since this particular module should
		# be a part of vfio-pci dependencies, however, on some distros, it seems that
		# it's not the case. See #1689.
		if modinfo vfio_iommu_type1 > /dev/null; then
			modprobe vfio_iommu_type1
		fi
	elif ! check_for_driver uio_pci_generic || modinfo uio_pci_generic > /dev/null 2>&1; then
		driver_name=uio_pci_generic
	elif [[ -e $igb_uio_fallback ]]; then
		driver_path="$igb_uio_fallback"
		driver_name="igb_uio"
		echo "WARNING: uio_pci_generic not detected - using $driver_name"
	else
		echo "No valid drivers found [vfio-pci, uio_pci_generic, igb_uio]. Please enable one of the kernel modules."
		return 1
	fi

	# modprobe assumes the directory of the module. If the user passes in a path, we should use insmod
	if [[ $driver_name != "none" ]]; then
		if [[ -n "$driver_path" ]]; then
			insmod $driver_path || true
		else
			modprobe $driver_name
		fi
	fi

	for bdf in "${!all_devices_d[@]}"; do
		if ((all_devices_d["$bdf"] == 0)); then
			if [[ -n ${nvme_d["$bdf"]} ]]; then
				# Some nvme controllers may take significant amount of time while being
				# unbound from the driver. Put that task into background to speed up the
				# whole process. Currently this is done only for the devices bound to the
				# nvme driver as other, i.e., ioatdma's, trigger a kernel BUG when being
				# unbound in parallel. See https://bugzilla.kernel.org/show_bug.cgi?id=209041.
				linux_bind_driver "$bdf" "$driver_name" &
			else
				linux_bind_driver "$bdf" "$driver_name"
			fi
		fi
	done
	# Wait for the backgrounded nvme rebinds to finish.
	wait

	echo "1" > "/sys/bus/pci/rescan"
}
471
# Remove orphaned SPDK runtime artifacts: dpdk per-pid runtime dirs,
# /dev/shm trace/conn files and pci/cpu lock files. Anything that still
# shows up in some process' open-fd list is reported and kept.
function cleanup_linux() {
	local dirs_to_clean=() files_to_clean=() opened_files=() file_locks=()
	local match_spdk="spdk_tgt|iscsi|vhost|nvmf|rocksdb|bdevio|bdevperf|vhost_fuzz|nvme_fuzz|accel_perf|bdev_svc"

	# Candidate dirs: per-pid dpdk runtime dirs (extglob +([0-9])).
	dirs_to_clean=({/var/run,/tmp}/dpdk/spdk{,_pid}+([0-9]))
	if [[ -d $XDG_RUNTIME_DIR ]]; then
		dirs_to_clean+=("$XDG_RUNTIME_DIR/dpdk/spdk"{,_pid}+([0-9]))
	fi

	for dir in "${dirs_to_clean[@]}"; do
		files_to_clean+=("$dir/"*)
	done
	file_locks+=(/var/tmp/spdk_pci_lock*)
	file_locks+=(/var/tmp/spdk_cpu_lock*)

	files_to_clean+=(/dev/shm/@(@($match_spdk)_trace|spdk_iscsi_conns)*)
	files_to_clean+=("${file_locks[@]}")

	# This may fail in case path that readlink attempts to resolve suddenly
	# disappears (as it may happen with terminating processes).
	opened_files+=($(readlink -f /proc/+([0-9])/fd/+([0-9]))) || true

	if ((${#opened_files[@]} == 0)); then
		echo "Can't get list of opened files!"
		exit 1
	fi

	echo 'Cleaning'
	for f in "${files_to_clean[@]}"; do
		[[ -e $f ]] || continue
		# Substring match against the system-wide list of open fds.
		if [[ ${opened_files[*]} != *"$f"* ]]; then
			echo "Removing:    $f"
			rm $f
		else
			echo "Still open: $f"
		fi
	done

	for dir in "${dirs_to_clean[@]}"; do
		[[ -d $dir ]] || continue
		if [[ ${opened_files[*]} != *"$dir"* ]]; then
			echo "Removing:    $dir"
			rmdir $dir
		else
			echo "Still open: $dir"
		fi
	done
	echo "Clean"
}
521
# Request NRHUGE hugepages through the given sysfs/procfs interface file.
# Arguments: $1 - path to an nr_hugepages interface file,
#            $2 - (optional) NUMA node id, used in messages only.
# Skips the allocation when enough pages already exist, unless
# SHRINK_HUGE=yes. Returns 1 when the kernel satisfied less than NRHUGE.
check_hugepages_alloc() {
	local hp_interface=$1 node=$2
	local current

	current=$(< "$hp_interface")

	# Already have enough? Don't shrink unless explicitly requested.
	if [[ $SHRINK_HUGE != yes ]] && ((NRHUGE <= current)); then
		echo "INFO: Requested $NRHUGE hugepages but $current already allocated ${node:+on node$node}"
		return 0
	fi

	# Clamp negative requests to 0, then ask the kernel.
	echo $((NRHUGE < 0 ? 0 : NRHUGE)) > "$hp_interface"

	# Re-read to see how many pages the kernel actually provided.
	current=$(< "$hp_interface")
	((current >= NRHUGE)) && return 0

	cat <<- ERROR

		## ERROR: requested $NRHUGE hugepages but $current could be allocated ${node:+on node$node}.
		## Memory might be heavily fragmented. Please try flushing the system cache, or reboot the machine.
	ERROR
	return 1
}
545
546clear_hugepages() { echo 0 > /proc/sys/vm/nr_hugepages; }
547
# Allocate hugepages according to CLEAR_HUGE / HUGE_EVEN_ALLOC /
# HUGENODE / NRHUGE. Without NUMA support, falls back to the system-wide
# /proc interface. HUGENODE accepts a comma-separated list of node ids
# or explicit "nodes_hp[<node>]=<pages>" assignments (applied via eval).
configure_linux_hugepages() {
	local node system_nodes
	local nodes_to_use nodes_hp
	# NOTE(review): the nodes[] map below is not declared local - it
	# leaks into the global scope; confirm nothing relies on that.

	if [[ $CLEAR_HUGE == yes ]]; then
		clear_hugepages
	fi

	if [[ $HUGE_EVEN_ALLOC == yes ]]; then
		# Even allocation: clear first, then let the kernel distribute.
		clear_hugepages
		check_hugepages_alloc /proc/sys/vm/nr_hugepages
		return 0
	fi

	# Map node id -> that node's per-size nr_hugepages interface.
	for node in /sys/devices/system/node/node*; do
		[[ -e $node ]] || continue
		nodes[${node##*node}]=$node/hugepages/hugepages-${HUGEPGSZ}kB/nr_hugepages
	done

	if ((${#nodes[@]} == 0)); then
		# No NUMA support? Fallback to common interface
		check_hugepages_alloc /proc/sys/vm/nr_hugepages
		return 0
	fi

	IFS="," read -ra nodes_to_use <<< "$HUGENODE"
	if ((${#nodes_to_use[@]} == 0)); then
		nodes_to_use[0]=0
	fi

	# Align indexes with node ids
	for node in "${!nodes_to_use[@]}"; do
		if [[ ${nodes_to_use[node]} =~ ^nodes_hp\[[0-9]+\]= ]]; then
			eval "${nodes_to_use[node]}"
		elif [[ ${nodes_to_use[node]} =~ ^[0-9]+$ ]]; then
			nodes_hp[nodes_to_use[node]]=$NRHUGE
		fi
	done

	for node in "${!nodes_hp[@]}"; do
		if [[ -z ${nodes[node]} ]]; then
			echo "Node $node doesn't exist, ignoring" >&2
			continue
		fi
		# Per-node page count overrides NRHUGE for this call only.
		NRHUGE=${nodes_hp[node]:-$NRHUGE} check_hugepages_alloc "${nodes[node]}" "$node"
	done
}
595
# Top-level Linux "config" flow: bind PCI devices, ensure a hugetlbfs
# mount exists, allocate hugepages and - when vfio-pci was selected -
# hand the mounts over to TARGET_USER and warn about low memlock limits.
function configure_linux() {
	configure_linux_pci
	hugetlbfs_mounts=$(linux_hugetlbfs_mounts)

	if [ -z "$hugetlbfs_mounts" ]; then
		hugetlbfs_mounts=/mnt/huge
		echo "Mounting hugetlbfs at $hugetlbfs_mounts"
		mkdir -p "$hugetlbfs_mounts"
		mount -t hugetlbfs nodev "$hugetlbfs_mounts"
	fi

	configure_linux_hugepages

	# driver_name is set (globally) by configure_linux_pci above.
	if [ "$driver_name" = "vfio-pci" ]; then
		if [ -n "$TARGET_USER" ]; then
			for mount in $hugetlbfs_mounts; do
				chown "$TARGET_USER" "$mount"
				chmod g+w "$mount"
			done

			# Check the target user's memlock limit - vfio pins memory.
			MEMLOCK_AMNT=$(su "$TARGET_USER" -c "ulimit -l")
			if [[ $MEMLOCK_AMNT != "unlimited" ]]; then
				MEMLOCK_MB=$((MEMLOCK_AMNT / 1024))
				cat <<- MEMLOCK
					"$TARGET_USER" user memlock limit: $MEMLOCK_MB MB

					This is the maximum amount of memory you will be
					able to use with DPDK and VFIO if run as user "$TARGET_USER".
					To change this, please adjust limits.conf memlock limit for user "$TARGET_USER".
				MEMLOCK
				if ((MEMLOCK_AMNT < 65536)); then
					echo ""
					echo "## WARNING: memlock limit is less than 64MB"
					echo -n "## DPDK with VFIO may not be able to initialize "
					echo "if run as user \"$TARGET_USER\"."
				fi
			fi
		fi
	fi

	if [ $(uname -i) == "x86_64" ] && [ ! -e /dev/cpu/0/msr ]; then
		# Some distros build msr as a module.  Make sure it's loaded to ensure
		#  DPDK can easily figure out the TSC rate rather than relying on 100ms
		#  sleeps.
		modprobe msr &> /dev/null || true
	fi
}
643
# Rebind every usable device to its native kernel driver (as resolved by
# collect_driver); devices whose driver isn't available are unbound only.
function reset_linux_pci() {
	# virtio
	# TODO: check if virtio-pci is loaded first and just unbind if it is not loaded
	# Requires some more investigation - for example, some kernels do not seem to have
	#  virtio-pci but just virtio_scsi instead.  Also need to make sure we get the
	#  underscore vs. dash right in the virtio_scsi name.
	modprobe virtio-pci || true
	for bdf in "${!all_devices_d[@]}"; do
		((all_devices_d["$bdf"] == 0)) || continue

		driver=$(collect_driver "$bdf")
		# check_for_driver returns non-zero when the driver is present.
		if [[ -n $driver ]] && ! check_for_driver "$driver"; then
			linux_bind_driver "$bdf" "$driver"
		else
			linux_unbind_driver "$bdf"
		fi
	done

	echo "1" > "/sys/bus/pci/rescan"
}
664
# Rebind PCI devices to their original drivers and clear leftover SPDK
# state: unlocked hugepage map files under every hugetlbfs mount, plus
# /run/.spdk* runtime files.
function reset_linux() {
	reset_linux_pci

	for mount in $(linux_hugetlbfs_mounts); do
		for hp in "$mount"/spdk*map_*; do
			if flock -n "$hp" true; then
				# Nobody holds a lock on this map file - safe to drop it.
				rm -f "$hp"
			fi
		done
	done

	rm -f /run/.spdk*
}
674
# Print hugepage statistics per NUMA node (or system-wide when no NUMA
# hugepage interfaces exist) followed by a table of all SPDK-compatible
# devices with vendor/device IDs, NUMA node, driver and block devices.
# Table headers go to stderr.
function status_linux() {
	echo "Hugepages" >&2
	printf "%-6s %10s %8s / %6s\n" "node" "hugesize" "free" "total" >&2

	numa_nodes=0
	for path in /sys/devices/system/node/node*/hugepages/hugepages-*/; do
		numa_nodes=$((numa_nodes + 1))
		free_pages=$(cat $path/free_hugepages)
		all_pages=$(cat $path/nr_hugepages)

		# Pull the node name and page size out of the sysfs path.
		[[ $path =~ (node[0-9]+)/hugepages/hugepages-([0-9]+kB) ]]

		node=${BASH_REMATCH[1]}
		huge_size=${BASH_REMATCH[2]}

		printf "%-6s %10s %8s / %6s\n" $node $huge_size $free_pages $all_pages
	done

	# fall back to system-wide hugepages
	if [ "$numa_nodes" = "0" ]; then
		free_pages=$(grep HugePages_Free /proc/meminfo | awk '{ print $2 }')
		all_pages=$(grep HugePages_Total /proc/meminfo | awk '{ print $2 }')
		node="-"
		huge_size="$HUGEPGSZ"

		printf "%-6s %10s %8s / %6s\n" $node $huge_size $free_pages $all_pages
	fi

	printf '\n%-25s %-15s %-6s %-6s %-7s %-16s %-10s %s\n' \
		"Type" "BDF" "Vendor" "Device" "NUMA" "Driver" "Device" "Block devices" >&2

	sorted_bdfs=($(printf '%s\n' "${!all_devices_d[@]}" | sort))

	for bdf in "${sorted_bdfs[@]}"; do
		driver=${pci_bus_driver["$bdf"]}
		if [ "$numa_nodes" = "0" ]; then
			node="-"
		else
			node=$(cat /sys/bus/pci/devices/$bdf/numa_node)
			if ((node == -1)); then
				node=unknown
			fi
		fi
		# Kernel device name (e.g. nvme0) when the controller is bound to nvme.
		if [ "$driver" = "nvme" ] && [ -d /sys/bus/pci/devices/$bdf/nvme ]; then
			name=$(ls /sys/bus/pci/devices/$bdf/nvme)
		else
			name="-"
		fi

		if [[ -n ${nvme_d["$bdf"]} || -n ${virtio_d["$bdf"]} ]]; then
			blknames=($(get_block_dev_from_bdf "$bdf"))
		else
			blknames=("-")
		fi

		# First non-empty device-type map determines the printed type.
		desc=""
		desc=${desc:-${nvme_d["$bdf"]:+NVMe${nvme_vmd_d["$bdf"]:+@${nvme_vmd_d["$bdf"]}(VMD)}}}
		desc=${desc:-${ioat_d["$bdf"]:+I/OAT}}
		desc=${desc:-${dsa_d["$bdf"]:+DSA}}
		desc=${desc:-${iaa_d["$bdf"]:+IAA}}
		desc=${desc:-${virtio_d["$bdf"]:+virtio}}
		desc=${desc:-${vmd_d["$bdf"]:+VMD}}

		printf '%-25s %-15s %-6s %-6s %-7s %-16s %-10s %s\n' \
			"$desc" "$bdf" "${pci_ids_vendor["$bdf"]#0x}" "${pci_ids_device["$bdf"]#0x}" \
			"$node" "${driver:--}" "${name:-}" "${blknames[*]:--}"
	done
}
743
# Print contigmem state (module presence, buffer size/count from the
# kernel env) and a per-type table of SPDK-compatible devices with their
# drivers. The table header goes to stderr.
function status_freebsd() {
	local pci

	# Helper (runs in a subshell): one row per BDF, sorted by address.
	status_print() (
		local type=$1
		local dev driver

		shift

		for pci; do
			printf '%-8s %-15s %-6s %-6s %-16s\n' \
				"$type" \
				"$pci" \
				"${pci_ids_vendor["$pci"]}" \
				"${pci_ids_device["$pci"]}" \
				"${pci_bus_driver["$pci"]}"
		done | sort -k2,2
	)

	local contigmem=present
	local contigmem_buffer_size
	local contigmem_num_buffers

	if ! kldstat -q -m contigmem; then
		contigmem="not present"
	fi
	if ! contigmem_buffer_size=$(kenv hw.contigmem.buffer_size 2> /dev/null); then
		contigmem_buffer_size="not set"
	fi
	if ! contigmem_num_buffers=$(kenv hw.contigmem.num_buffers 2> /dev/null); then
		contigmem_num_buffers="not set"
	fi

	cat <<- BSD_INFO
		Contigmem ($contigmem)
		Buffer Size: $contigmem_buffer_size
		Num Buffers: $contigmem_num_buffers

	BSD_INFO

	printf '\n%-8s %-15s %-6s %-6s %-16s\n' \
		"Type" "BDF" "Vendor" "Device" "Driver" >&2

	status_print "NVMe" "${!nvme_d[@]}"
	status_print "I/OAT" "${!ioat_d[@]}"
	status_print "DSA" "${!dsa_d[@]}"
	status_print "IAA" "${!iaa_d[@]}"
	status_print "VMD" "${!vmd_d[@]}"
}
793
# (Re)bind the given BDFs ($@) to nic_uio. Refuses to touch an existing
# nic_uio setup containing unsupported devices unless
# FORCE_NIC_UIO_REBIND=yes; otherwise those devices are kept in the list.
function configure_freebsd_pci() {
	local BDFS

	BDFS+=("$@")

	if ((${#unsupported_nic_uio_hw[@]} > 0)) && [[ $FORCE_NIC_UIO_REBIND != yes ]]; then
		warn_unsupported_nic_uio_hw
		return 1
	fi

	# Keep unsupported devices attached to nic_uio across the reload.
	BDFS+=("${unsupported_nic_uio_hw[@]}")

	if kldstat -n nic_uio &> /dev/null; then
		kldunload nic_uio.ko
	fi

	# nic_uio reads its device list from the kernel env as a
	# comma-separated string.
	local IFS=","
	kenv hw.nic_uio.bdfs="${BDFS[*]}"
	kldload nic_uio.ko
}
814
# Populate the global unsupported_nic_uio_hw[] array with BDFs that are
# listed in the kernel's hw.nic_uio.bdfs but are not among the devices
# this script recognizes (FreeBSD only). Always returns 0.
function get_unsupported_nic_uio_hw() {
	local configured=() dev known
	local -g unsupported_nic_uio_hw

	# No nic_uio setup present in the kernel env - nothing to collect.
	IFS="," read -ra configured < <(kenv hw.nic_uio.bdfs 2> /dev/null) || return 0

	known=$(printf '%s\n' "${!all_devices_d[@]}")
	for dev in "${configured[@]}"; do
		grep -q "$dev" <<< "$known" || unsupported_nic_uio_hw+=("$dev")
	done

	return 0
}
827
# Print a warning listing the devices from hw.nic_uio.bdfs that this
# script does not recognize (collected by get_unsupported_nic_uio_hw).
function warn_unsupported_nic_uio_hw() {
	cat <<- NIC_UIO

		WARNING: Unsupported devices detected in the nic_uio setup:

		$(printf '  %s\n' "${unsupported_nic_uio_hw[@]}")

		Remove them first or pass FORCE_NIC_UIO_REBIND=yes through the environment.

	NIC_UIO
}
839
# Configure all supported device types (NVMe, I/OAT, DSA, IAA, VMD) on
# FreeBSD by handing their BDFs to _configure_freebsd().
function configure_freebsd() {
	local -a bdfs=()

	bdfs+=("${!nvme_d[@]}")
	bdfs+=("${!ioat_d[@]}")
	bdfs+=("${!dsa_d[@]}")
	bdfs+=("${!iaa_d[@]}")
	bdfs+=("${!vmd_d[@]}")

	_configure_freebsd "${bdfs[@]}"
}
843
# Shared FreeBSD config path: verify the DPDK modules exist, bind the
# given BDFs ($@) to nic_uio and (re)load contigmem sized from HUGEMEM
# (HUGEMEM MB split into 256MB buffers).
function _configure_freebsd() {
	if ! check_for_driver_freebsd; then
		echo "DPDK drivers (contigmem and/or nic_uio) are missing, aborting" >&2
		return 1
	fi
	configure_freebsd_pci "$@"
	# If contigmem is already loaded but the HUGEMEM specified doesn't match the
	#  previous value, unload contigmem so that we can reload with the new value.
	if kldstat -q -m contigmem; then
		# contigmem may be loaded, but the kernel environment doesn't have to
		# be necessarily set at this point. If it isn't, kenv will fail to
		# pick up the hw. options. Handle it.
		if ! contigmem_num_buffers=$(kenv hw.contigmem.num_buffers); then
			contigmem_num_buffers=-1
		fi 2> /dev/null
		if ((contigmem_num_buffers != HUGEMEM / 256)); then
			kldunload contigmem.ko
		fi
	fi
	if ! kldstat -q -m contigmem; then
		kenv hw.contigmem.num_buffers=$((HUGEMEM / 256))
		kenv hw.contigmem.buffer_size=$((256 * 1024 * 1024))
		kldload contigmem.ko
	fi
}
869
# Unload contigmem and nic_uio. When unsupported devices are part of the
# nic_uio setup, refuse (unless FORCE_NIC_UIO_REBIND=yes); afterwards
# recreate the setup with only those unsupported devices.
function reset_freebsd() {
	# Don't reap the entire nic_uio setup in case there are unsupported devices in the kernel env
	if ((${#unsupported_nic_uio_hw[@]} > 0)) && [[ $FORCE_NIC_UIO_REBIND != yes ]]; then
		warn_unsupported_nic_uio_hw
		return 1
	fi

	kldunload contigmem.ko || true
	kldunload nic_uio.ko || true

	if ((${#unsupported_nic_uio_hw[@]} > 0)); then
		# HACK: try to be nice and recreate the setup but only with the unsupported devices
		_unsupported_nic_uio_hw=("${unsupported_nic_uio_hw[@]}") unsupported_nic_uio_hw=()
		_configure_freebsd "${_unsupported_nic_uio_hw[@]}"
	fi
}
886
# Resolve the hugepage size (HUGEPGSZ, in kB) and number of pages
# (NRHUGE) from the environment. An unsupported HUGEPGSZ is discarded
# with a warning, falling back to the kernel's default size from
# /proc/meminfo. NRHUGE defaults to HUGEMEM (MB) rounded up to a whole
# number of hugepages. Sets the globals HUGEPGSZ, HUGEPGSZ_MB, NRHUGE.
function set_hp() {
	local supported=/sys/kernel/mm/hugepages/hugepages-${HUGEPGSZ}kB

	if [[ -n $HUGEPGSZ && ! -e $supported ]]; then
		echo "${HUGEPGSZ}kB is not supported by the running kernel, ignoring" >&2
		unset -v HUGEPGSZ
	fi

	HUGEPGSZ=${HUGEPGSZ:-$(grep Hugepagesize /proc/meminfo | cut -d : -f 2 | tr -dc '0-9')}
	HUGEPGSZ_MB=$((HUGEPGSZ / 1024))
	# Round up so at least HUGEMEM MB worth of pages is requested.
	NRHUGE=${NRHUGE:-$(((HUGEMEM + HUGEPGSZ_MB - 1) / HUGEPGSZ_MB))}
}
897
kmsg "spdk: $0 $* (start)"

# Scan the PCI bus up front. CMD=reset presumably tells common.sh's
# cache_pci_bus to cache every device regardless of mode - TODO confirm
# against scripts/common.sh.
CMD=reset cache_pci_bus

# Mode of operation; defaults to "config".
mode=$1

if [ -z "$mode" ]; then
	mode="config"
fi

# Environment defaults (documented in usage()).
: ${HUGEMEM:=2048}
: ${PCI_ALLOWED:=""}
: ${PCI_BLOCKED:=""}

# NOTE(review): NVME_ALLOWED / SKIP_PCI are not documented in usage() -
# presumably kept for compatibility with older callers; verify.
if [ -n "$NVME_ALLOWED" ]; then
	PCI_ALLOWED="$PCI_ALLOWED $NVME_ALLOWED"
fi

if [ -n "$SKIP_PCI" ]; then
	PCI_ALLOWED="none"
fi

# Default TARGET_USER: the sudo caller, falling back to logname.
if [ -z "$TARGET_USER" ]; then
	TARGET_USER="$SUDO_USER"
	if [ -z "$TARGET_USER" ]; then
		TARGET_USER=$(logname 2> /dev/null) || true
	fi
fi

collect_devices "$mode"

if [[ $os == Linux ]]; then
	set_hp
fi
932
# Interactive mode hands control over to the menu script first.
if [[ $mode == interactive ]]; then
	source "$rootdir/scripts/common/setup/interactive.sh"
	main_menu "$2" || exit 0
fi

# Optionally wait (in the background) for block devices to reappear
# after a reset, via sync_dev_uevents.sh.
if [[ $mode == reset && $PCI_BLOCK_SYNC_ON_RESET == yes ]]; then
	# Note that this will wait only for the first block device attached to
	# a given storage controller. For nvme this may miss some of the devs
	# in case multiple namespaces are being in place.
	# FIXME: Wait for nvme controller(s) to be in live state and determine
	# number of configured namespaces, build list of potential block devs
	# and pass them to sync_dev_uevents. Is it worth the effort?
	bdfs_to_wait_for=()
	for bdf in "${!all_devices_d[@]}"; do
		((all_devices_d["$bdf"] == 0)) || continue
		if [[ -n ${nvme_d["$bdf"]} || -n ${virtio_d["$bdf"]} ]]; then
			# Only wait for devices that will actually change drivers.
			[[ $(collect_driver "$bdf") != "${pci_bus_driver["$bdf"]}" ]] || continue
			bdfs_to_wait_for+=("$bdf")
		fi
	done
	if ((${#bdfs_to_wait_for[@]} > 0)); then
		echo "Waiting for block devices as requested"
		export UEVENT_TIMEOUT=5 DEVPATH_LOOKUP=yes DEVPATH_SUBSYSTEM=pci
		"$rootdir/scripts/sync_dev_uevents.sh" \
			block/disk \
			"${bdfs_to_wait_for[@]}" &
		sync_pid=$!
	fi
fi

# Dispatch the requested mode to the per-OS implementation.
if [[ $os == Linux ]]; then
	if [ "$mode" == "config" ]; then
		configure_linux
	elif [ "$mode" == "cleanup" ]; then
		cleanup_linux
		clear_hugepages
	elif [ "$mode" == "reset" ]; then
		reset_linux
	elif [ "$mode" == "status" ]; then
		status_linux
	elif [ "$mode" == "help" ]; then
		usage $0
	else
		usage $0 "Invalid argument '$mode'"
	fi
else
	if [ "$mode" == "config" ]; then
		configure_freebsd
	elif [ "$mode" == "reset" ]; then
		reset_freebsd
	elif [ "$mode" == "cleanup" ]; then
		echo "setup.sh cleanup function not yet supported on $os"
	elif [ "$mode" == "status" ]; then
		status_freebsd
	elif [ "$mode" == "help" ]; then
		usage $0
	else
		usage $0 "Invalid argument '$mode'"
	fi
fi

# Reap the uevent-sync helper if it was started and is still running.
if [[ -e /proc/$sync_pid/status ]]; then
	wait "$sync_pid"
fi

kmsg "spdk: $0 $* (done)"
999