#!/usr/bin/env bash
#  SPDX-License-Identifier: BSD-3-Clause
#  Copyright (C) 2016 Intel Corporation
#  All rights reserved.
#
set -e
shopt -s nullglob extglob

os=$(uname -s)

if [[ $os != Linux && $os != FreeBSD ]]; then
	echo "Unsupported platform ($os), aborting"
	exit 1
fi

rootdir=$(readlink -f "$(dirname "$0")")/..
source "$rootdir/scripts/common.sh"

function usage() {
	if [[ $os == Linux ]]; then
		options="[config|reset|status|cleanup|interactive|help]"
	else
		options="[config|reset|interactive|help]"
	fi

	[[ -n $2 ]] && (
		echo "$2"
		echo ""
	)
	echo "Helper script for allocating hugepages and binding NVMe, I/OAT, VMD and Virtio devices"
	echo "to a generic VFIO kernel driver. If VFIO is not available on the system, this script"
	echo "will fall back to UIO. NVMe and Virtio devices with active mountpoints will be ignored."
	echo "All hugepage operations use the default hugepage size on the system (hugepagesz)."
	echo "Usage: $(basename "$1") $options"
	echo
	echo "$options - as follows:"
	echo "config            Default mode. Allocate hugepages and bind PCI devices."
	if [[ $os == Linux ]]; then
		echo "cleanup           Remove any orphaned files that may be left in the system after SPDK application exit"
	fi
	echo "reset             Rebind PCI devices back to their original drivers."
	echo "                  Also clean up any leftover SPDK files/resources."
	echo "                  Hugepage memory size will remain unchanged."
	if [[ $os == Linux ]]; then
		echo "status            Print status of all SPDK-compatible devices on the system."
	fi
	echo "interactive       Executes script in interactive mode."
	echo "help              Print this help message."
	echo
	echo "The following environment variables can be specified."
	echo "HUGEMEM           Size of hugepage memory to allocate (in MB). 2048 by default."
	echo "                  For NUMA systems, the hugepages will be distributed on node0 by"
	echo "                  default."
	echo "HUGE_EVEN_ALLOC   If set to 'yes', hugepages will be evenly distributed across all"
	echo "                  of the system's NUMA nodes (effectively ignoring anything set in HUGENODE)."
	echo "                  Uses the kernel's default hugepage size."
	echo "NRHUGE            Number of hugepages to allocate. This variable overrides HUGEMEM."
	echo "HUGENODE          Specific NUMA node to allocate hugepages on. Multiple nodes can be"
	echo "                  separated with commas. By default, NRHUGE will be applied on each node."
	echo "                  Hugepages can be defined per node with e.g.:"
	echo "                  HUGENODE='nodes_hp[0]=2048,nodes_hp[1]=512,2' - this will allocate"
	echo "                  2048 pages for node0, 512 for node1 and default NRHUGE for node2."
	echo "HUGEPGSZ          Size of the hugepages to use in kB. If not set, the kernel's default"
	echo "                  setting is used."
	echo "SHRINK_HUGE       If set to 'yes', hugepages allocation won't be skipped in case the"
	echo "                  number of requested hugepages is lower than what's already"
	echo "                  allocated. Doesn't apply when HUGE_EVEN_ALLOC is in use."
	echo "CLEAR_HUGE        If set to 'yes', an attempt to remove hugepages from all nodes will"
	echo "                  be made prior to allocation."
	echo "PCI_ALLOWED"
	echo "PCI_BLOCKED       Whitespace separated list of PCI devices (NVMe, I/OAT, VMD, Virtio)."
	echo "                  Each device must be specified as a full PCI address."
	echo "                  E.g. PCI_ALLOWED=\"0000:01:00.0 0000:02:00.0\""
	echo "                  To block all PCI devices: PCI_ALLOWED=\"none\""
	echo "                  To allow all PCI devices except 0000:01:00.0: PCI_BLOCKED=\"0000:01:00.0\""
	echo "                  To allow only PCI device 0000:01:00.0: PCI_ALLOWED=\"0000:01:00.0\""
	echo "                  If PCI_ALLOWED and PCI_BLOCKED are empty or unset, all PCI devices"
	echo "                  will be bound."
	echo "                  Each device in PCI_BLOCKED will be ignored (driver won't be changed)."
	echo "                  PCI_BLOCKED has precedence over PCI_ALLOWED."
	echo "TARGET_USER       User that will own hugepage mountpoint directory and vfio groups."
	echo "                  By default the current user will be used."
	echo "DRIVER_OVERRIDE   Disable automatic vfio-pci/uio_pci_generic selection and forcefully"
	echo "                  bind devices to the given driver."
	echo "                  E.g. DRIVER_OVERRIDE=uio_pci_generic or DRIVER_OVERRIDE=/home/public/dpdk/build/kmod/igb_uio.ko"
	echo "PCI_BLOCK_SYNC_ON_RESET"
	echo "                  If set in the environment, an attempt to wait for block devices associated"
	echo "                  with a given PCI device will be made upon reset."
	echo "UNBIND_ENTIRE_IOMMU_GROUP"
	echo "                  If set, all devices from the nvme's iommu group will be unbound from their drivers."
	echo "                  Use with caution."
	echo "DEV_TYPE"
	echo "                  Perform action only against the selected type of devices. Supported:"
	echo "                    IOAT|DSA|IAA|VIRTIO|VMD|NVME."
	echo "                  Default is to select all types."
	exit 0
}

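# Example invocations (illustrative values, not the only valid ones):
#   HUGEMEM=4096 ./setup.sh                       # 4 GB of hugepages, bind all supported devices
#   PCI_BLOCKED="0000:01:00.0" ./setup.sh config  # bind everything except 0000:01:00.0
#   ./setup.sh reset                              # rebind devices to their original drivers
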
# In monolithic kernels lsmod won't work, so back it with a check against
# /sys/module. We also check /sys/bus/pci/drivers/ as neither lsmod nor
# /sys/module may contain the needed info (like on Fedora-like OSes).
function check_for_driver() {
	if [[ -z $1 ]]; then
		return 0
	fi

	if lsmod | grep -q ${1//-/_}; then
		return 1
	fi

	if [[ -d /sys/module/${1} || -d \
		/sys/module/${1//-/_} || -d \
		/sys/bus/pci/drivers/${1} || -d \
		/sys/bus/pci/drivers/${1//-/_} ]]; then
		return 2
	fi
	return 0
}

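# Note the inverted return convention above: 0 means the driver was NOT found,
# 1 (lsmod) or 2 (sysfs) means it is present. Callers therefore negate it to
# test for presence, e.g. "! check_for_driver uio_pci_generic" is true when
# the module is already loaded or known to sysfs.
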
function check_for_driver_freebsd() {
	# Check if dpdk drivers (nic_uio, contigmem) are in the kernel's module path.
	local search_paths path driver
	IFS=";" read -ra search_paths < <(kldconfig -rU)

	for driver in contigmem.ko nic_uio.ko; do
		for path in "${search_paths[@]}"; do
			[[ -f $path/$driver ]] && continue 2
		done
		return 1
	done
	return 0
}

function pci_dev_echo() {
	local bdf="$1"
	shift
	echo "$bdf (${pci_ids_vendor["$bdf"]#0x} ${pci_ids_device["$bdf"]#0x}): $*"
}

function probe_driver() {
	local bdf=$1
	local driver_name=$2
	local old_driver_name=${pci_bus_driver["$bdf"]:-no driver}

	if [[ $driver_name == "$old_driver_name" ]]; then
		pci_dev_echo "$bdf" "Already using the $old_driver_name driver"
		return 0
	fi

	if [[ $old_driver_name != "no driver" ]]; then
		echo "$bdf" > "/sys/bus/pci/devices/$bdf/driver/unbind"
	fi

	pci_dev_echo "$bdf" "$old_driver_name -> $driver_name"

	if [[ $driver_name == "none" ]]; then
		return 0
	fi

	local probe_attempts=0
	echo "$driver_name" > "/sys/bus/pci/devices/$bdf/driver_override"
	while ! echo "$bdf" > "/sys/bus/pci/drivers_probe" && ((probe_attempts++ < 10)); do
		pci_dev_echo "$bdf" "failed to bind to $driver_name, retrying ($probe_attempts)"
		sleep 0.5
	done 2> /dev/null

	echo "" > "/sys/bus/pci/devices/$bdf/driver_override"

	if [[ ! -e /sys/bus/pci/drivers/$driver_name/$bdf ]]; then
		pci_dev_echo "$bdf" "failed to bind to $driver_name, aborting"
		return 1
	fi
}

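# probe_driver() relies on the driver_override sysfs attribute: it pins the
# device to the requested driver, re-triggers a probe, then clears the
# override. Done by hand this would look roughly like the following
# (0000:01:00.0 is just an example address):
#   echo vfio-pci > /sys/bus/pci/devices/0000:01:00.0/driver_override
#   echo 0000:01:00.0 > /sys/bus/pci/drivers_probe
#   echo "" > /sys/bus/pci/devices/0000:01:00.0/driver_override
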
function linux_bind_driver() {
	local bdf="$1"
	local driver_name="$2"

	probe_driver "$bdf" "$driver_name"

	local iommu_group=${pci_iommu_groups["$bdf"]}
	if [ -e "/dev/vfio/$iommu_group" ]; then
		if [ -n "$TARGET_USER" ]; then
			chown "$TARGET_USER" "/dev/vfio/$iommu_group"
		fi
	fi

	local iommug=("${!iommu_groups[iommu_group]}")
	local _bdf _driver
	if ((${#iommug[@]} > 1)) && [[ $driver_name == vfio* ]]; then
		pci_dev_echo "$bdf" "WARNING: detected multiple devices (${#iommug[@]}) under the same IOMMU group!"
		for _bdf in "${iommug[@]}"; do
			[[ $_bdf == "$bdf" ]] && continue
			_driver=$(readlink -f "/sys/bus/pci/devices/$_bdf/driver") && _driver=${_driver##*/}
			if [[ $_driver == "$driver_name" ]]; then
				continue
			fi
			# See what DPDK considers to be a "viable" iommu group: dpdk/lib/eal/linux/eal_vfio.c -> rte_vfio_setup_device()
			pci_dev_echo "$bdf" "WARNING: ${_bdf##*/} not bound to $driver_name (${_driver:-no driver})"
			pci_dev_echo "$bdf" "WARNING: All devices in the IOMMU group must be bound to the same driver or unbound"
			if [[ $UNBIND_ENTIRE_IOMMU_GROUP == yes ]]; then
				pci_dev_echo "$bdf" "WARNING: Attempting to unbind ${_bdf##*/}"
				pci_bus_driver["${_bdf##*/}"]=$_driver
				probe_driver "${_bdf##*/}" none
			fi
		done
	fi
}

function linux_unbind_driver() {
	local bdf="$1"
	local old_driver_name=${pci_bus_driver["$bdf"]:-no driver}

	if [[ $old_driver_name == "no driver" ]]; then
		pci_dev_echo "$bdf" "Not bound to any driver"
		return 0
	fi

	if [[ -e /sys/bus/pci/drivers/$old_driver_name ]]; then
		echo "$bdf" > "/sys/bus/pci/drivers/$old_driver_name/unbind"
		echo "" > "/sys/bus/pci/devices/$bdf/driver_override"
	fi

	pci_dev_echo "$bdf" "$old_driver_name -> no driver"
}

function linux_hugetlbfs_mounts() {
	mount | grep ' type hugetlbfs ' | awk '{ print $3 }'
}

function get_block_dev_from_bdf() {
	local bdf=$1
	local block blocks=() ctrl sub

	for block in /sys/block/!(nvme*); do
		if [[ $(readlink -f "$block/device") == *"/$bdf/"* ]]; then
			blocks+=("${block##*/}")
		fi
	done

	blocks+=($(get_block_dev_from_nvme "$bdf"))

	printf '%s\n' "${blocks[@]}"
}

function get_block_dev_from_nvme() {
	local bdf=$1 block ctrl sub

	for ctrl in /sys/class/nvme/nvme*; do
		[[ -e $ctrl/address && $(< "$ctrl/address") == "$bdf" ]] || continue
		sub=$(< "$ctrl/subsysnqn") && break
	done

	[[ -n $sub ]] || return 0

	for block in /sys/block/nvme*; do
		[[ -e $block/hidden && $(< "$block/hidden") == 1 ]] && continue
		[[ $(< "$block/device/subsysnqn") == "$sub" ]] && echo "${block##*/}"
	done
}

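# Note that namespaces are matched through the shared subsystem NQN rather
# than the sysfs device hierarchy, which should also catch namespaces that
# belong to the same NVMe subsystem but are reachable via another controller
# (e.g. native multipath setups).
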
function get_used_bdf_block_devs() {
	local bdf=$1
	local blocks block blockp dev mount holder
	local used=()

	hash lsblk &> /dev/null || return 1
	blocks=($(get_block_dev_from_bdf "$bdf"))

	for block in "${blocks[@]}"; do
		# Check if the device is held by some other device, regardless of
		# whether it's mounted or not.
		for holder in "/sys/class/block/$block"*/holders/*; do
			[[ -e $holder ]] || continue
			blockp=${holder%/holders*} blockp=${blockp##*/}
			if [[ -e $holder/slaves/$blockp ]]; then
				used+=("holder@$blockp:${holder##*/}")
			fi
		done
		while read -r dev mount; do
			if [[ -e $mount ]]; then
				used+=("mount@$block:$dev")
			fi
		done < <(lsblk -l -n -o NAME,MOUNTPOINT "/dev/$block")
		if ((${#used[@]} == 0)); then
			# Make sure we check if there's any valid data present on the target device
			# regardless of whether it's being actively used or not. This is mainly done
			# to make sure we don't miss more complex setups like ZFS pools, etc.
			if block_in_use "$block" > /dev/null; then
				used+=("data@$block")
			fi
		fi
	done

	if ((${#used[@]} > 0)); then
		printf '%s\n' "${used[@]}"
	fi
}

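# Sample output of get_used_bdf_block_devs() (illustrative device names),
# following the holder@/mount@/data@ tags built above:
#   holder@nvme0n1:dm-0
#   mount@nvme0n1:nvme0n1p1
#   data@nvme1n1
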
function collect_devices() {
	local mode=$1 in_use

	map_supported_devices "$DEV_TYPE"

	for bdf in "${!all_devices_d[@]}"; do
		in_use=0
		if [[ $mode != status ]]; then
			if ! pci_can_use "$bdf"; then
				pci_dev_echo "$bdf" "Skipping denied controller at $bdf"
				in_use=1
			fi
		fi
		if [[ -n ${nvme_d["$bdf"]} || -n ${virtio_d["$bdf"]} ]]; then
			if ! verify_bdf_block_devs "$bdf"; then
				in_use=1
			fi
		fi
		if [[ -n ${vmd_d["$bdf"]} ]]; then
			if [[ $PCI_ALLOWED != *"$bdf"* ]]; then
				pci_dev_echo "$bdf" "Skipping not allowed VMD controller at $bdf"
				in_use=1
			elif ((vmd_nvme_count["$bdf"] > 0)) && [[ $DRIVER_OVERRIDE != none && $mode == config ]]; then
				cat <<- MESSAGE
					Binding new driver to VMD device with NVMe SSDs attached to the kernel:
					  ${!vmd_nvme_d["$bdf"]}
					The binding process may go faster if you first run this script with
					DRIVER_OVERRIDE="none" to unbind only the NVMe SSDs, and then run
					again to unbind the VMD devices.
				MESSAGE
			fi
		fi
		# Update in-use for each bdf. Default from map_supported_devices() is 0 == "not used"
		local -n type_ref=${all_devices_type_d["$bdf"]}_d
		type_ref["$bdf"]=$in_use
		all_devices_d["$bdf"]=$in_use
	done

	# Check if we got any nvmes attached to VMDs sharing the same iommu_group - if there are
	# any, skip them since they won't be usable by SPDK without moving the entire VMD ctrl
	# away from the kernel first. That said, allow touching the nvmes in case the user requested
	# all devices to be unbound from any driver or if a dedicated override flag was set.
	[[ -z $ALLOW_NVME_BEHIND_VMD && $DRIVER_OVERRIDE != none ]] || return 0

	for bdf in "${!nvme_d[@]}"; do
		is_nvme_iommu_shared_with_vmd "$bdf" || continue
		nvme_d["$bdf"]=1 all_devices_d["$bdf"]=1
		pci_dev_echo "$bdf" "Skipping nvme behind VMD (${nvme_vmd_d["$bdf"]})"
	done

	return 0
}

function collect_driver() {
	local bdf=$1
	local drivers driver

	if [[ -e /sys/bus/pci/devices/$bdf/modalias ]] \
		&& drivers=($(modprobe -R "$(< "/sys/bus/pci/devices/$bdf/modalias")")); then
		# Pick the first entry in case multiple aliases are bound to a driver.
		driver=$(readlink -f "/sys/module/${drivers[0]}/drivers/pci:"*)
		driver=${driver##*/}
	else
		[[ -n ${nvme_d["$bdf"]} ]] && driver=nvme
		[[ -n ${ioat_d["$bdf"]} ]] && driver=ioatdma
		[[ -n ${dsa_d["$bdf"]} ]] && driver=idxd
		[[ -n ${iaa_d["$bdf"]} ]] && driver=idxd
		[[ -n ${virtio_d["$bdf"]} ]] && driver=virtio-pci
		[[ -n ${vmd_d["$bdf"]} ]] && driver=vmd
	fi 2> /dev/null
	echo "$driver"
}

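# collect_driver() resolves the default kernel driver by asking modprobe to
# translate the device's modalias into a module name. The same lookup can be
# reproduced manually, e.g. (0000:01:00.0 being just an example address):
#   modprobe -R "$(< /sys/bus/pci/devices/0000:01:00.0/modalias)"
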
function verify_bdf_block_devs() {
	local bdf=$1
	local blknames
	blknames=($(get_used_bdf_block_devs "$bdf")) || return 1

	if ((${#blknames[@]} > 0)); then
		local IFS=","
		pci_dev_echo "$bdf" "Active devices: ${blknames[*]}, so not binding PCI dev"
		return 1
	fi
}

function configure_linux_pci() {
	local driver_path=""
	driver_name=""
	igb_uio_fallback=""

	if [[ -r "$rootdir/dpdk/build-tmp/kernel/linux/igb_uio/igb_uio.ko" ]]; then
		# igb_uio is a common driver to override with and it depends on uio.
		modprobe uio || true
		if ! check_for_driver igb_uio || insmod "$rootdir/dpdk/build-tmp/kernel/linux/igb_uio/igb_uio.ko"; then
			igb_uio_fallback="$rootdir/dpdk/build-tmp/kernel/linux/igb_uio/igb_uio.ko"
		fi
	fi

	if [[ "${DRIVER_OVERRIDE}" == "none" ]]; then
		driver_name=none
	elif [[ -n "${DRIVER_OVERRIDE}" ]]; then
		driver_path="$DRIVER_OVERRIDE"
		driver_name="${DRIVER_OVERRIDE##*/}"
		# modprobe and the sysfs don't use the .ko suffix.
		driver_name=${driver_name%.ko}
		# path = name -> there is no path
		if [[ "$driver_path" = "$driver_name" ]]; then
			driver_path=""
		fi
	elif is_iommu_enabled; then
		driver_name=vfio-pci
		# Just in case, attempt to load the VFIO_IOMMU_TYPE1 module into the kernel - this
		# should be done automatically by modprobe since this particular module should
		# be a part of vfio-pci dependencies, however, on some distros, it seems that
		# it's not the case. See #1689.
		if modinfo vfio_iommu_type1 > /dev/null; then
			modprobe vfio_iommu_type1
		fi
	elif ! check_for_driver uio_pci_generic || modinfo uio_pci_generic > /dev/null 2>&1; then
		driver_name=uio_pci_generic
	elif [[ -e $igb_uio_fallback ]]; then
		driver_path="$igb_uio_fallback"
		driver_name="igb_uio"
		echo "WARNING: uio_pci_generic not detected - using $driver_name"
	else
		echo "No valid drivers found [vfio-pci, uio_pci_generic, igb_uio]. Please enable one of the kernel modules."
		return 1
	fi

	# modprobe only searches the kernel's module directories. If the user passed
	# in a path, use insmod instead.
	if [[ $driver_name != "none" ]]; then
		if [[ -n "$driver_path" ]]; then
			insmod "$driver_path" || true
		else
			modprobe "$driver_name"
		fi
	fi

	for bdf in "${!all_devices_d[@]}"; do
		if ((all_devices_d["$bdf"] == 0)); then
			if [[ -n ${nvme_d["$bdf"]} ]]; then
				# Some nvme controllers may take a significant amount of time while being
				# unbound from the driver. Put that task into the background to speed up
				# the whole process. Currently this is done only for the devices bound to
				# the nvme driver as others, i.e., ioatdma's, trigger a kernel BUG when
				# being unbound in parallel. See https://bugzilla.kernel.org/show_bug.cgi?id=209041.
				linux_bind_driver "$bdf" "$driver_name" &
			else
				linux_bind_driver "$bdf" "$driver_name"
			fi
		fi
	done
	wait

	echo "1" > "/sys/bus/pci/rescan"
}

function cleanup_linux() {
	local dirs_to_clean=() files_to_clean=() opened_files=() file_locks=()
	local match_spdk="spdk_tgt|iscsi|vhost|nvmf|rocksdb|bdevio|bdevperf|vhost_fuzz|nvme_fuzz|accel_perf|bdev_svc"

	dirs_to_clean=({/var/run,/tmp}/dpdk/spdk{,_pid}+([0-9]))
	if [[ -d $XDG_RUNTIME_DIR ]]; then
		dirs_to_clean+=("$XDG_RUNTIME_DIR/dpdk/spdk"{,_pid}+([0-9]))
	fi

	for dir in "${dirs_to_clean[@]}"; do
		files_to_clean+=("$dir/"*)
	done
	file_locks+=(/var/tmp/spdk_pci_lock*)
	file_locks+=(/var/tmp/spdk_cpu_lock*)

	files_to_clean+=(/dev/shm/@(@($match_spdk)_trace|spdk_iscsi_conns)*)
	files_to_clean+=("${file_locks[@]}")

	# This may fail in case the path that readlink attempts to resolve suddenly
	# disappears (as may happen with terminating processes).
	opened_files+=($(readlink -f /proc/+([0-9])/fd/+([0-9]))) || true

	if ((${#opened_files[@]} == 0)); then
		echo "Can't get list of opened files!"
		exit 1
	fi

	echo 'Cleaning'
	for f in "${files_to_clean[@]}"; do
		[[ -e $f ]] || continue
		if [[ ${opened_files[*]} != *"$f"* ]]; then
			echo "Removing:    $f"
			rm "$f"
		else
			echo "Still open: $f"
		fi
	done

	for dir in "${dirs_to_clean[@]}"; do
		[[ -d $dir ]] || continue
		if [[ ${opened_files[*]} != *"$dir"* ]]; then
			echo "Removing:    $dir"
			rmdir "$dir"
		else
			echo "Still open: $dir"
		fi
	done
	echo "Clean"
}

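# Note: the +([0-9]) and @(...) globs used above require extglob, which is
# enabled via shopt at the top of this script.
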
check_hugepages_alloc() {
	local hp_int=$1
	local allocated_hugepages

	allocated_hugepages=$(< "$hp_int")

	if ((NRHUGE <= allocated_hugepages)) && [[ $SHRINK_HUGE != yes ]]; then
		echo "INFO: Requested $NRHUGE hugepages but $allocated_hugepages already allocated ${2:+on node$2}"
		return 0
	fi

	echo $((NRHUGE < 0 ? 0 : NRHUGE)) > "$hp_int"

	allocated_hugepages=$(< "$hp_int")
	if ((allocated_hugepages < NRHUGE)); then
		cat <<- ERROR

			## ERROR: requested $NRHUGE hugepages but only $allocated_hugepages could be allocated ${2:+on node$2}.
			## Memory might be heavily fragmented. Please try flushing the system cache, or reboot the machine.
		ERROR
		return 1
	fi
}

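# A quick sanity check of the arithmetic (illustrative): with the default
# HUGEMEM=2048 and 2048 kB hugepages, set_hp() below yields NRHUGE=1024, so
# 1024 is written to the nr_hugepages interface unless at least that many
# pages are already allocated (and SHRINK_HUGE is not in effect).
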
clear_hugepages() { echo 0 > /proc/sys/vm/nr_hugepages; }

configure_linux_hugepages() {
	local node system_nodes
	local nodes_to_use nodes_hp

	if [[ $CLEAR_HUGE == yes ]]; then
		clear_hugepages
	fi

	if [[ $HUGE_EVEN_ALLOC == yes ]]; then
		clear_hugepages
		check_hugepages_alloc /proc/sys/vm/nr_hugepages
		return 0
	fi

	for node in /sys/devices/system/node/node*; do
		[[ -e $node ]] || continue
		nodes[${node##*node}]=$node/hugepages/hugepages-${HUGEPGSZ}kB/nr_hugepages
	done

	if ((${#nodes[@]} == 0)); then
		# No NUMA support? Fall back to the common interface
		check_hugepages_alloc /proc/sys/vm/nr_hugepages
		return 0
	fi

	IFS="," read -ra nodes_to_use <<< "$HUGENODE"
	if ((${#nodes_to_use[@]} == 0)); then
		nodes_to_use[0]=0
	fi

	# Align indexes with node ids
	for node in "${!nodes_to_use[@]}"; do
		if [[ ${nodes_to_use[node]} =~ ^nodes_hp\[[0-9]+\]= ]]; then
			eval "${nodes_to_use[node]}"
		elif [[ ${nodes_to_use[node]} =~ ^[0-9]+$ ]]; then
			nodes_hp[nodes_to_use[node]]=$NRHUGE
		fi
	done

	for node in "${!nodes_hp[@]}"; do
		if [[ -z ${nodes[node]} ]]; then
			echo "Node $node doesn't exist, ignoring" >&2
			continue
		fi
		NRHUGE=${nodes_hp[node]:-$NRHUGE} check_hugepages_alloc "${nodes[node]}" "$node"
	done
}

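# Example (illustrative): HUGENODE='nodes_hp[0]=2048,1' NRHUGE=512 ./setup.sh
# would allocate 2048 pages on node0 (via the eval'd per-node syntax) and the
# NRHUGE value of 512 on node1 (via the plain numeric syntax).
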
function configure_linux() {
	configure_linux_pci
	hugetlbfs_mounts=$(linux_hugetlbfs_mounts)

	if [ -z "$hugetlbfs_mounts" ]; then
		hugetlbfs_mounts=/mnt/huge
		echo "Mounting hugetlbfs at $hugetlbfs_mounts"
		mkdir -p "$hugetlbfs_mounts"
		mount -t hugetlbfs nodev "$hugetlbfs_mounts"
	fi

	configure_linux_hugepages

	if [ "$driver_name" = "vfio-pci" ]; then
		if [ -n "$TARGET_USER" ]; then
			for mount in $hugetlbfs_mounts; do
				chown "$TARGET_USER" "$mount"
				chmod g+w "$mount"
			done

			MEMLOCK_AMNT=$(su "$TARGET_USER" -c "ulimit -l")
			if [[ $MEMLOCK_AMNT != "unlimited" ]]; then
				MEMLOCK_MB=$((MEMLOCK_AMNT / 1024))
				cat <<- MEMLOCK
					"$TARGET_USER" user memlock limit: $MEMLOCK_MB MB

					This is the maximum amount of memory you will be
					able to use with DPDK and VFIO if run as user "$TARGET_USER".
					To change this, please adjust limits.conf memlock limit for user "$TARGET_USER".
				MEMLOCK
				if ((MEMLOCK_AMNT < 65536)); then
					echo ""
					echo "## WARNING: memlock limit is less than 64MB"
					echo -n "## DPDK with VFIO may not be able to initialize "
					echo "if run as user \"$TARGET_USER\"."
				fi
			fi
		fi
	fi

	if [ "$(uname -i)" == "x86_64" ] && [ ! -e /dev/cpu/0/msr ]; then
		# Some distros build msr as a module.  Make sure it's loaded to ensure
		#  DPDK can easily figure out the TSC rate rather than relying on 100ms
		#  sleeps.
		modprobe msr &> /dev/null || true
	fi
}

function reset_linux_pci() {
	# virtio
	# TODO: check if virtio-pci is loaded first and just unbind if it is not loaded
	# Requires some more investigation - for example, some kernels do not seem to have
	#  virtio-pci but just virtio_scsi instead.  Also need to make sure we get the
	#  underscore vs. dash right in the virtio_scsi name.
	modprobe virtio-pci || true
	for bdf in "${!all_devices_d[@]}"; do
		((all_devices_d["$bdf"] == 0)) || continue

		driver=$(collect_driver "$bdf")
		if [[ -n $driver ]] && ! check_for_driver "$driver"; then
			linux_bind_driver "$bdf" "$driver"
		else
			linux_unbind_driver "$bdf"
		fi
	done

	echo "1" > "/sys/bus/pci/rescan"
}

function reset_linux() {
	reset_linux_pci
	for mount in $(linux_hugetlbfs_mounts); do
		for hp in "$mount"/spdk*map_*; do
			flock -n "$hp" true && rm -f "$hp"
		done
	done
	rm -f /run/.spdk*
}

function status_linux() {
	echo "Hugepages" >&2
	printf "%-6s %10s %8s / %6s\n" "node" "hugesize" "free" "total" >&2

	numa_nodes=0
	for path in /sys/devices/system/node/node*/hugepages/hugepages-*/; do
		numa_nodes=$((numa_nodes + 1))
		free_pages=$(< "$path/free_hugepages")
		all_pages=$(< "$path/nr_hugepages")

		[[ $path =~ (node[0-9]+)/hugepages/hugepages-([0-9]+kB) ]]

		node=${BASH_REMATCH[1]}
		huge_size=${BASH_REMATCH[2]}

		printf "%-6s %10s %8s / %6s\n" "$node" "$huge_size" "$free_pages" "$all_pages"
	done

	# fall back to system-wide hugepages
	if [ "$numa_nodes" = "0" ]; then
		free_pages=$(grep HugePages_Free /proc/meminfo | awk '{ print $2 }')
		all_pages=$(grep HugePages_Total /proc/meminfo | awk '{ print $2 }')
		node="-"
		huge_size="$HUGEPGSZ"

		printf "%-6s %10s %8s / %6s\n" "$node" "$huge_size" "$free_pages" "$all_pages"
	fi

	printf '\n%-25s %-15s %-6s %-6s %-7s %-16s %-10s %s\n' \
		"Type" "BDF" "Vendor" "Device" "NUMA" "Driver" "Device" "Block devices" >&2

	sorted_bdfs=($(printf '%s\n' "${!all_devices_d[@]}" | sort))

	for bdf in "${sorted_bdfs[@]}"; do
		driver=${pci_bus_driver["$bdf"]}
		if [ "$numa_nodes" = "0" ]; then
			node="-"
		else
			node=$(< "/sys/bus/pci/devices/$bdf/numa_node")
			if ((node == -1)); then
				node=unknown
			fi
		fi
		if [ "$driver" = "nvme" ] && [ -d /sys/bus/pci/devices/$bdf/nvme ]; then
			name=$(ls /sys/bus/pci/devices/$bdf/nvme)
		else
			name="-"
		fi

		if [[ -n ${nvme_d["$bdf"]} || -n ${virtio_d["$bdf"]} ]]; then
			blknames=($(get_block_dev_from_bdf "$bdf"))
		else
			blknames=("-")
		fi

		desc=""
		desc=${desc:-${nvme_d["$bdf"]:+NVMe${nvme_vmd_d["$bdf"]:+@${nvme_vmd_d["$bdf"]}(VMD)}}}
		desc=${desc:-${ioat_d["$bdf"]:+I/OAT}}
		desc=${desc:-${dsa_d["$bdf"]:+DSA}}
		desc=${desc:-${iaa_d["$bdf"]:+IAA}}
		desc=${desc:-${virtio_d["$bdf"]:+virtio}}
		desc=${desc:-${vmd_d["$bdf"]:+VMD}}

		printf '%-25s %-15s %-6s %-6s %-7s %-16s %-10s %s\n' \
			"$desc" "$bdf" "${pci_ids_vendor["$bdf"]#0x}" "${pci_ids_device["$bdf"]#0x}" \
			"$node" "${driver:--}" "${name:-}" "${blknames[*]:--}"
	done
}

function status_freebsd() {
	local pci

	status_print() (
		local type=$1
		local dev driver

		shift

		for pci; do
			printf '%-8s %-15s %-6s %-6s %-16s\n' \
				"$type" \
				"$pci" \
				"${pci_ids_vendor["$pci"]}" \
				"${pci_ids_device["$pci"]}" \
				"${pci_bus_driver["$pci"]}"
		done | sort -k2,2
	)

	local contigmem=present
	local contigmem_buffer_size
	local contigmem_num_buffers

	if ! kldstat -q -m contigmem; then
		contigmem="not present"
	fi
	if ! contigmem_buffer_size=$(kenv hw.contigmem.buffer_size 2> /dev/null); then
		contigmem_buffer_size="not set"
	fi
	if ! contigmem_num_buffers=$(kenv hw.contigmem.num_buffers 2> /dev/null); then
		contigmem_num_buffers="not set"
	fi

	cat <<- BSD_INFO
		Contigmem ($contigmem)
		Buffer Size: $contigmem_buffer_size
		Num Buffers: $contigmem_num_buffers

	BSD_INFO

	printf '\n%-8s %-15s %-6s %-6s %-16s\n' \
		"Type" "BDF" "Vendor" "Device" "Driver" >&2

	status_print "NVMe" "${!nvme_d[@]}"
	status_print "I/OAT" "${!ioat_d[@]}"
	status_print "DSA" "${!dsa_d[@]}"
	status_print "IAA" "${!iaa_d[@]}"
	status_print "VMD" "${!vmd_d[@]}"
}

function configure_freebsd_pci() {
	local BDFS

	BDFS+=("${!nvme_d[@]}")
	BDFS+=("${!ioat_d[@]}")
	BDFS+=("${!dsa_d[@]}")
	BDFS+=("${!iaa_d[@]}")
	BDFS+=("${!vmd_d[@]}")

	# Drop the domain part from all the addresses
	BDFS=("${BDFS[@]#*:}")

	local IFS=","
	kldunload nic_uio.ko || true
	kenv hw.nic_uio.bdfs="${BDFS[*]}"
	kldload nic_uio.ko
}

function configure_freebsd() {
	if ! check_for_driver_freebsd; then
		echo "DPDK drivers (contigmem and/or nic_uio) are missing, aborting" >&2
		return 1
	fi
	configure_freebsd_pci
	# If contigmem is already loaded but the HUGEMEM specified doesn't match the
	#  previous value, unload contigmem so that we can reload with the new value.
	if kldstat -q -m contigmem; then
		# contigmem may be loaded, but the kernel environment isn't necessarily
		# set at this point. If it isn't, kenv will fail to pick up the hw.
		# options. Handle it.
		if ! contigmem_num_buffers=$(kenv hw.contigmem.num_buffers); then
			contigmem_num_buffers=-1
		fi 2> /dev/null
		if ((contigmem_num_buffers != HUGEMEM / 256)); then
			kldunload contigmem.ko
		fi
	fi
	if ! kldstat -q -m contigmem; then
		kenv hw.contigmem.num_buffers=$((HUGEMEM / 256))
		kenv hw.contigmem.buffer_size=$((256 * 1024 * 1024))
		kldload contigmem.ko
	fi
}

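# contigmem memory is carved out as HUGEMEM/256 buffers of 256 MB each, so
# the default HUGEMEM=2048 translates to 8 buffers.
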
function reset_freebsd() {
	kldunload contigmem.ko || true
	kldunload nic_uio.ko || true
}

function set_hp() {
	if [[ -n $HUGEPGSZ && ! -e /sys/kernel/mm/hugepages/hugepages-${HUGEPGSZ}kB ]]; then
		echo "${HUGEPGSZ}kB is not supported by the running kernel, ignoring" >&2
		unset -v HUGEPGSZ
	fi

	HUGEPGSZ=${HUGEPGSZ:-$(grep Hugepagesize /proc/meminfo | cut -d : -f 2 | tr -dc '0-9')}
	HUGEPGSZ_MB=$((HUGEPGSZ / 1024))
	NRHUGE=${NRHUGE:-$(((HUGEMEM + HUGEPGSZ_MB - 1) / HUGEPGSZ_MB))}
}

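# The NRHUGE formula above is a ceiling division, e.g. (illustrative)
# HUGEMEM=1025 with 2 MB pages (HUGEPGSZ_MB=2) gives NRHUGE=(1025+2-1)/2=513,
# so the requested memory is always fully covered by whole hugepages.
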
841kmsg "spdk: $0 $* (start)"
842
843CMD=reset cache_pci_bus
844
845mode=$1
846
847if [ -z "$mode" ]; then
848	mode="config"
849fi
850
851: ${HUGEMEM:=2048}
852: ${PCI_ALLOWED:=""}
853: ${PCI_BLOCKED:=""}
854
855if [ -n "$NVME_ALLOWED" ]; then
856	PCI_ALLOWED="$PCI_ALLOWED $NVME_ALLOWED"
857fi
858
859if [ -n "$SKIP_PCI" ]; then
860	PCI_ALLOWED="none"
861fi
862
863if [ -z "$TARGET_USER" ]; then
864	TARGET_USER="$SUDO_USER"
865	if [ -z "$TARGET_USER" ]; then
866		TARGET_USER=$(logname 2> /dev/null) || true
867	fi
868fi
869
870collect_devices "$mode"
871
872if [[ $os == Linux ]]; then
873	set_hp
874fi
875
if [[ $mode == interactive ]]; then
	source "$rootdir/scripts/common/setup/interactive.sh"
	main_menu "$2" || exit 0
fi

if [[ $mode == reset && $PCI_BLOCK_SYNC_ON_RESET == yes ]]; then
	# Note that this will wait only for the first block device attached to
	# a given storage controller. For nvme this may miss some of the devs
	# in case multiple namespaces are in place.
	# FIXME: Wait for nvme controller(s) to be in live state and determine
	# the number of configured namespaces, build a list of potential block devs
	# and pass them to sync_dev_uevents. Is it worth the effort?
	bdfs_to_wait_for=()
	for bdf in "${!all_devices_d[@]}"; do
		((all_devices_d["$bdf"] == 0)) || continue
		if [[ -n ${nvme_d["$bdf"]} || -n ${virtio_d["$bdf"]} ]]; then
			[[ $(collect_driver "$bdf") != "${pci_bus_driver["$bdf"]}" ]] || continue
			bdfs_to_wait_for+=("$bdf")
		fi
	done
	if ((${#bdfs_to_wait_for[@]} > 0)); then
		echo "Waiting for block devices as requested"
		export UEVENT_TIMEOUT=5 DEVPATH_LOOKUP=yes DEVPATH_SUBSYSTEM=pci
		"$rootdir/scripts/sync_dev_uevents.sh" \
			block/disk \
			"${bdfs_to_wait_for[@]}" &
		sync_pid=$!
	fi
fi

if [[ $os == Linux ]]; then
	if [ "$mode" == "config" ]; then
		configure_linux
	elif [ "$mode" == "cleanup" ]; then
		cleanup_linux
		clear_hugepages
	elif [ "$mode" == "reset" ]; then
		reset_linux
	elif [ "$mode" == "status" ]; then
		status_linux
	elif [ "$mode" == "help" ]; then
		usage "$0"
	else
		usage "$0" "Invalid argument '$mode'"
	fi
else
	if [ "$mode" == "config" ]; then
		configure_freebsd
	elif [ "$mode" == "reset" ]; then
		reset_freebsd
	elif [ "$mode" == "cleanup" ]; then
		echo "setup.sh cleanup function not yet supported on $os"
	elif [ "$mode" == "status" ]; then
		status_freebsd
	elif [ "$mode" == "help" ]; then
		usage "$0"
	else
		usage "$0" "Invalid argument '$mode'"
	fi
fi

if [[ -e /proc/$sync_pid/status ]]; then
	wait "$sync_pid"
fi

kmsg "spdk: $0 $* (done)"