xref: /spdk/scripts/setup.sh (revision cb7af50cfdf0f6fa3c3b3755969b8d386cbaca34)
#!/usr/bin/env bash
#  SPDX-License-Identifier: BSD-3-Clause
#  Copyright (C) 2016 Intel Corporation
#  All rights reserved.
#
# Abort on the first failing command. nullglob makes unmatched globs expand to
# nothing (relied upon all over this script), extglob enables extended patterns
# such as +([0-9]) used by cleanup_linux().
set -e
shopt -s nullglob extglob

os=$(uname -s)

# Only Linux and FreeBSD are supported platforms.
if [[ $os != Linux && $os != FreeBSD ]]; then
	echo "Not supported platform ($os), aborting"
	exit 1
fi

# Resolve the repository root (parent of scripts/) and pull in the shared
# helpers (PCI bus cache, device maps, block_in_use, kmsg, etc.).
rootdir=$(readlink -f $(dirname $0))/..
source "$rootdir/scripts/common.sh"
# Print the help text (the option set differs between Linux and FreeBSD) and
# exit 0.
# $1 - name the script was invoked as (used in the Usage line)
# $2 - optional error message printed before the help text
function usage() {
	if [[ $os == Linux ]]; then
		options="[config|reset|status|cleanup|interactive|help]"
	else
		options="[config|reset|interactive|help]"
	fi

	[[ -n $2 ]] && (
		echo "$2"
		echo ""
	)
	echo "Helper script for allocating hugepages and binding NVMe, I/OAT, VMD and Virtio devices"
	echo "to a generic VFIO kernel driver. If VFIO is not available on the system, this script"
	echo "will fall back to UIO. NVMe and Virtio devices with active mountpoints will be ignored."
	echo "All hugepage operations use default hugepage size on the system (hugepagesz)."
	echo "Usage: $(basename $1) $options"
	echo
	echo "$options - as following:"
	echo "config            Default mode. Allocate hugepages and bind PCI devices."
	if [[ $os == Linux ]]; then
		echo "cleanup           Remove any orphaned files that can be left in the system after SPDK application exit"
	fi
	echo "reset             Rebind PCI devices back to their original drivers."
	echo "                  Also cleanup any leftover spdk files/resources."
	echo "                  Hugepage memory size will remain unchanged."
	if [[ $os == Linux ]]; then
		echo "status            Print status of all SPDK-compatible devices on the system."
	fi
	echo "interactive       Executes script in interactive mode."
	echo "help              Print this help message."
	echo
	echo "The following environment variables can be specified."
	echo "HUGEMEM           Size of hugepage memory to allocate (in MB). 2048 by default."
	echo "                  For NUMA systems, the hugepages will be distributed on node0 by"
	echo "                  default."
	echo "NRHUGE            Number of hugepages to allocate. This variable overwrites HUGEMEM."
	echo "HUGENODE          Specific NUMA node to allocate hugepages on. Multiple nodes can be"
	echo "                  separated with commas. By default, NRHUGE will be applied on each node."
	echo "                  Hugepages can be defined per node with e.g.:"
	echo "                  HUGENODE='nodes_hp[0]=2048,nodes_hp[1]=512,2' - this will allocate"
	echo "                  2048 pages for node0, 512 for node1 and default NRHUGE for node2."
	echo "HUGEPGSZ          Size of the hugepages to use in kB. If not set, kernel's default"
	echo "                  setting is used."
	echo "SHRINK_HUGE       If set to 'yes', hugepages allocation won't be skipped in case"
	echo "                  number of requested hugepages is lower from what's already"
	echo "                  allocated."
	echo "CLEAR_HUGE        If set to 'yes', the attempt to remove hugepages from all nodes will"
	echo "                  be made prior to allocation."
	echo "PCI_ALLOWED"
	echo "PCI_BLOCKED       Whitespace separated list of PCI devices (NVMe, I/OAT, VMD, Virtio)."
	echo "                  Each device must be specified as a full PCI address."
	echo "                  E.g. PCI_ALLOWED=\"0000:01:00.0 0000:02:00.0\""
	echo "                  To block all PCI devices: PCI_ALLOWED=\"none\""
	echo "                  To allow all PCI devices except 0000:01:00.0: PCI_BLOCKED=\"0000:01:00.0\""
	echo "                  To allow only PCI device 0000:01:00.0: PCI_ALLOWED=\"0000:01:00.0\""
	echo "                  If PCI_ALLOWED and PCI_BLOCKED are empty or unset, all PCI devices"
	echo "                  will be bound."
	echo "                  Each device in PCI_BLOCKED will be ignored (driver won't be changed)."
	echo "                  PCI_BLOCKED has precedence over PCI_ALLOWED."
	echo "TARGET_USER       User that will own hugepage mountpoint directory and vfio groups."
	echo "                  By default the current user will be used."
	echo "DRIVER_OVERRIDE   Disable automatic vfio-pci/uio_pci_generic selection and forcefully"
	echo "                  bind devices to the given driver."
	echo "                  E.g. DRIVER_OVERRIDE=uio_pci_generic or DRIVER_OVERRIDE=/home/public/dpdk/build/kmod/igb_uio.ko"
	echo "PCI_BLOCK_SYNC_ON_RESET"
	echo "                  If set in the environment, the attempt to wait for block devices associated"
	echo "                  with given PCI device will be made upon reset"
	echo "UNBIND_ENTIRE_IOMMU_GROUP"
	echo "                  If set, all devices from nvme's iommu group will be unbound from their drivers."
	echo "                  Use with caution."
	echo "DEV_TYPE"
	echo "                  Perform action only against selected type of devices. Supported:"
	echo "                    IOAT|DSA|IAA|VIRTIO|VMD|NVME."
	echo "                  Default is to select all types."
	echo "FORCE_NIC_UIO_REBIND"
	echo "                  When set to 'yes', an attempt to reload nic_uio will be made regardless"
	echo "                  of the kernel environment. Applicable only under FreeBSD."
	exit 0
}
98
# Check whether a kernel driver/module is present on the system.
# $1 - driver name; '-' and '_' are treated as equivalent (module naming).
# Returns 0 when not found (or no name given), 1 when lsmod lists it,
# 2 when it is only visible under /sys/module or /sys/bus/pci/drivers.
function check_for_driver() {
	local driver=$1

	if [[ -z $driver ]]; then
		return 0
	fi

	# Quote the pattern so grep receives it as a single argument (SC2086).
	if lsmod | grep -q "${driver//-/_}"; then
		return 1
	fi

	# In monolithic kernels the lsmod won't work. So back that with
	# /sys/module. We also check /sys/bus/pci/drivers/ as neither lsmod nor
	# /sys/module might contain the needed info (like in Fedora-like OS).
	if [[ -d /sys/module/${driver} ||
		-d /sys/module/${driver//-/_} ||
		-d /sys/bus/pci/drivers/${driver} ||
		-d /sys/bus/pci/drivers/${driver//-/_} ]]; then
		return 2
	fi
	return 0
}
120
# Check if dpdk drivers (nic_uio, contigmem) are in the kernel's module path.
# Returns 0 when every module is found in at least one search path, 1 otherwise.
function check_for_driver_freebsd() {
	local module_dirs dir module found
	# kldconfig -rU prints the module search paths as a ';'-separated list.
	IFS=";" read -ra module_dirs < <(kldconfig -rU)

	for module in contigmem.ko nic_uio.ko; do
		found=no
		for dir in "${module_dirs[@]}"; do
			if [[ -f $dir/$module ]]; then
				found=yes
				break
			fi
		done
		[[ $found == yes ]] || return 1
	done
	return 0
}
134
# Print a message prefixed with the device's BDF and its vendor:device ids
# (looked up in the global pci_ids_* maps, with the 0x prefix stripped).
# $1 - BDF, remaining args - message.
function pci_dev_echo() {
	local addr=$1 vendor device
	shift
	vendor=${pci_ids_vendor["$addr"]#0x}
	device=${pci_ids_device["$addr"]#0x}
	printf '%s (%s %s): %s\n' "$addr" "$vendor" "$device" "$*"
}
140
# Bind a PCI device to the given kernel driver (or only unbind it when the
# driver is "none") using the sysfs driver_override mechanism.
# $1 - device BDF (e.g. 0000:01:00.0)
# $2 - target driver name, or "none"
# Returns 1 when the device did not end up bound to the requested driver.
function probe_driver() {
	local bdf=$1
	local driver_name=$2
	# NOTE(review): intentionally not local - remains visible to the caller.
	old_driver_name=${pci_bus_driver["$bdf"]:-no driver}

	if [[ $driver_name == "$old_driver_name" ]]; then
		pci_dev_echo "$bdf" "Already using the $old_driver_name driver"
		return 0
	fi

	# Detach from the currently bound driver first, if any.
	if [[ $old_driver_name != "no driver" ]]; then
		echo "$bdf" > "/sys/bus/pci/devices/$bdf/driver/unbind"
	fi

	pci_dev_echo "$bdf" "$old_driver_name -> $driver_name"

	if [[ $driver_name == "none" ]]; then
		return 0
	fi

	# driver_override pins the probe to the requested driver. The probe may
	# transiently fail (e.g. right after the unbind above), so retry a few
	# times before giving up; stderr noise from the retries is discarded.
	local probe_attempts=0
	echo "$driver_name" > "/sys/bus/pci/devices/$bdf/driver_override"
	while ! echo "$bdf" > "/sys/bus/pci/drivers_probe" && ((probe_attempts++ < 10)); do
		pci_dev_echo "$bdf" "failed to bind to $driver_name, retrying ($probe_attempts)"
		sleep 0.5
	done 2> /dev/null

	# Clear the override so future probes (e.g. after reset) are not pinned.
	echo "" > "/sys/bus/pci/devices/$bdf/driver_override"

	if [[ ! -e /sys/bus/pci/drivers/$driver_name/$bdf ]]; then
		pci_dev_echo "$bdf" "failed to bind to $driver_name, aborting"
		return 1
	fi
}
175
# Bind the device to the requested driver and, for vfio, hand the IOMMU group
# device node over to TARGET_USER and warn about group mates left on other
# drivers (optionally unbinding them when UNBIND_ENTIRE_IOMMU_GROUP=yes).
# $1 - device BDF, $2 - driver name
function linux_bind_driver() {
	local bdf="$1"
	local driver_name="$2"

	probe_driver "$bdf" "$driver_name"

	# Make the vfio group char device accessible to the unprivileged user.
	local iommu_group=${pci_iommu_groups["$bdf"]}
	if [ -e "/dev/vfio/$iommu_group" ]; then
		if [ -n "$TARGET_USER" ]; then
			chown "$TARGET_USER" "/dev/vfio/$iommu_group"
		fi
	fi

	# Indirect expansion: iommu_groups[group] presumably names the array
	# holding all device paths in this group (populated by common.sh) -
	# TODO confirm against cache_pci_bus.
	local iommug=("${!iommu_groups[iommu_group]}")
	local _bdf _driver
	if ((${#iommug[@]} > 1)) && [[ $driver_name == vfio* ]]; then
		pci_dev_echo "$bdf" "WARNING: detected multiple devices (${#iommug[@]}) under the same IOMMU group!"
		for _bdf in "${iommug[@]}"; do
			[[ $_bdf == "$bdf" ]] && continue
			_driver=$(readlink -f "/sys/bus/pci/devices/$_bdf/driver") && _driver=${_driver##*/}
			if [[ $_driver == "$driver_name" ]]; then
				continue
			fi
			# See what DPDK considers to be a "viable" iommu group: dpdk/lib/eal/linux/eal_vfio.c -> rte_vfio_setup_device()
			pci_dev_echo "$bdf" "WARNING: ${_bdf##*/} not bound to $driver_name (${_driver:-no driver})"
			pci_dev_echo "$bdf" "WARNING All devices in the IOMMU group must be bound to the same driver or unbound"
			if [[ $UNBIND_ENTIRE_IOMMU_GROUP == yes ]]; then
				pci_dev_echo "$bdf" "WARNING: Attempting to unbind ${_bdf##*/}"
				pci_bus_driver["${_bdf##*/}"]=$_driver
				probe_driver "${_bdf##*/}" none
			fi
		done
	fi

}
211
# Unbind the device at $1 from its current driver (as recorded in the global
# pci_bus_driver map) and clear any driver_override left behind.
function linux_unbind_driver() {
	local bdf="$1"
	local old_driver_name=${pci_bus_driver["$bdf"]:-no driver}

	if [[ $old_driver_name == "no driver" ]]; then
		pci_dev_echo "$bdf" "Not bound to any driver"
		return 0
	fi

	# Only touch sysfs when the recorded driver actually exists.
	if [[ -e /sys/bus/pci/drivers/$old_driver_name ]]; then
		echo "$bdf" > "/sys/bus/pci/drivers/$old_driver_name/unbind"
		echo "" > "/sys/bus/pci/devices/$bdf/driver_override"
	fi

	pci_dev_echo "$bdf" "$old_driver_name -> no driver"
}
228
# List the mountpoints of all mounted hugetlbfs filesystems, one per line.
function linux_hugetlbfs_mounts() {
	mount | awk '/ type hugetlbfs / { print $3 }'
}
232
# List "in-use" markers for all block devices backed by the PCI device at $1.
# Emits one entry per line:
#   holder@<dev>:<holder> - device is claimed by another kernel block device
#   mount@<dev>:<name>    - device (or one of its partitions) is mounted
#   data@<dev>            - device carries recognizable data (e.g. ZFS pool)
# Prints nothing when the device is idle. Returns 1 when lsblk is missing.
function get_used_bdf_block_devs() {
	local bdf=$1
	local blocks block blockp dev mount holder
	local used

	hash lsblk &> /dev/null || return 1
	blocks=($(get_block_dev_from_bdf "$bdf"))

	for block in "${blocks[@]}"; do
		# Check if the device is hold by some other, regardless if it's mounted
		# or not.
		for holder in "/sys/class/block/$block"*/holders/*; do
			[[ -e $holder ]] || continue
			# blockp = name of the partition/device owning this holders/ dir.
			blockp=${holder%/holders*} blockp=${blockp##*/}
			if [[ -e $holder/slaves/$blockp ]]; then
				used+=("holder@$blockp:${holder##*/}")
			fi
		done
		# Catch active mountpoints of the device and all of its partitions.
		while read -r dev mount; do
			if [[ -e $mount ]]; then
				used+=("mount@$block:$dev")
			fi
		done < <(lsblk -l -n -o NAME,MOUNTPOINT "/dev/$block")
		if ((${#used[@]} == 0)); then
			# Make sure we check if there's any valid data present on the target device
			# regardless if it's being actively used or not. This is mainly done to make
			# sure we don't miss more complex setups like ZFS pools, etc.
			if block_in_use "$block" > /dev/null; then
				used+=("data@$block")
			fi
		fi
	done

	if ((${#used[@]} > 0)); then
		printf '%s\n' "${used[@]}"
	fi
}
270
# Build the global device maps (via map_supported_devices) and mark each BDF
# as either eligible for rebinding (0) or to be skipped (1), based on the
# allow/block lists, active block devices and VMD constraints.
# $1 - mode (config/reset/status/...); status mode skips the allow-list check.
function collect_devices() {
	local mode=$1 in_use

	map_supported_devices "$DEV_TYPE"

	for bdf in "${!all_devices_d[@]}"; do
		in_use=0
		if [[ $mode != status ]]; then
			if ! pci_can_use "$bdf"; then
				pci_dev_echo "$bdf" "Skipping denied controller at $bdf"
				in_use=1
			fi
		fi
		if [[ -n ${nvme_d["$bdf"]} || -n ${virtio_d["$bdf"]} ]]; then
			if ! verify_bdf_block_devs "$bdf"; then
				in_use=1
			fi
		fi
		if [[ -n ${vmd_d["$bdf"]} ]]; then
			if [[ $PCI_ALLOWED != *"$bdf"* ]]; then
				pci_dev_echo "$bdf" "Skipping not allowed VMD controller at $bdf"
				in_use=1
			elif ((vmd_nvme_count["$bdf"] > 0)) && [[ $DRIVER_OVERRIDE != none && $mode == config ]]; then
				# Fixed typo: this used to test $DRIVER_OVERRLDE, which is never
				# set, so the hint below was printed even with DRIVER_OVERRIDE=none.
				cat <<- MESSAGE
					Binding new driver to VMD device with NVMe SSDs attached to the kernel:
					  ${!vmd_nvme_d["$bdf"]}
					The binding process may go faster if you first run this script with
					DRIVER_OVERRIDE="none" to unbind only the NVMe SSDs, and then run
					again to unbind the VMD devices.
				MESSAGE
			fi
		fi
		if [[ -n ${dsa_d["$bdf"]} ]] && [[ $PCI_ALLOWED != *"$bdf"* ]]; then
			pci_dev_echo "$bdf" "Skipping not allowed DSA controller at $bdf"
			in_use=1
		fi
		if [[ -n ${iaa_d["$bdf"]} ]] && [[ $PCI_ALLOWED != *"$bdf"* ]]; then
			pci_dev_echo "$bdf" "Skipping not allowed IAA controller at $bdf"
			in_use=1
		fi
		# Update in-use for each bdf. Default from the map_supported_devices() is 0 == "not used"
		local -n type_ref=${all_devices_type_d["$bdf"]}_d
		type_ref["$bdf"]=$in_use
		all_devices_d["$bdf"]=$in_use
	done

	# Check if we got any nvmes attached to VMDs sharing the same iommu_group - if there are
	# any skip them since they won't be usable by SPDK without moving the entire VMD ctrl
	# away from the kernel first. That said, allow to touch the nvmes in case user requested
	# all devices to be unbound from any driver or if dedicated override flag was set.
	[[ -z $ALLOW_NVME_BEHIND_VMD && $DRIVER_OVERRIDE != none ]] || return 0

	for bdf in "${!nvme_d[@]}"; do
		is_nvme_iommu_shared_with_vmd "$bdf" || continue
		nvme_d["$bdf"]=1 all_devices_d["$bdf"]=1
		pci_dev_echo "$bdf" "Skipping nvme behind VMD (${nvme_vmd_d["$bdf"]})"
	done

	get_unsupported_nic_uio_hw

	return 0
}
333
334function collect_driver() {
335	local bdf=$1
336	local drivers driver
337
338	if [[ -e /sys/bus/pci/devices/$bdf/modalias ]] \
339		&& drivers=($(modprobe -R "$(< "/sys/bus/pci/devices/$bdf/modalias")")); then
340		# Pick first entry in case multiple aliases are bound to a driver.
341		driver=$(readlink -f "/sys/module/${drivers[0]}/drivers/pci:"*)
342		driver=${driver##*/}
343	else
344		[[ -n ${nvme_d["$bdf"]} ]] && driver=nvme
345		[[ -n ${ioat_d["$bdf"]} ]] && driver=ioatdma
346		[[ -n ${dsa_d["$bdf"]} ]] && driver=idxd
347		[[ -n ${iaa_d["$bdf"]} ]] && driver=idxd
348		[[ -n ${virtio_d["$bdf"]} ]] && driver=virtio-pci
349		[[ -n ${vmd_d["$bdf"]} ]] && driver=vmd
350	fi 2> /dev/null
351	echo "$driver"
352}
353
# Succeed only when the device at $1 has no block devices in active use.
# Returns 1 (and reports the offenders) otherwise.
function verify_bdf_block_devs() {
	local bdf=$1
	local busy_devs
	busy_devs=($(get_used_bdf_block_devs "$bdf")) || return 1

	if ((${#busy_devs[@]} > 0)); then
		local IFS=","
		pci_dev_echo "$bdf" "Active devices: ${busy_devs[*]}, so not binding PCI dev"
		return 1
	fi
}
365
# Pick the userspace driver to use (DRIVER_OVERRIDE first, then vfio-pci when
# the IOMMU is on, then uio_pci_generic, then the bundled igb_uio), load it
# and bind every eligible device to it. Sets the global driver_name consumed
# later by configure_linux().
function configure_linux_pci() {
	local driver_path=""
	driver_name=""
	igb_uio_fallback=""

	if [[ -r "$rootdir/dpdk/build-tmp/kernel/linux/igb_uio/igb_uio.ko" ]]; then
		# igb_uio is a common driver to override with and it depends on uio.
		modprobe uio || true
		if ! check_for_driver igb_uio || insmod "$rootdir/dpdk/build-tmp/kernel/linux/igb_uio/igb_uio.ko"; then
			igb_uio_fallback="$rootdir/dpdk/build-tmp/kernel/linux/igb_uio/igb_uio.ko"
		fi
	fi

	if [[ "${DRIVER_OVERRIDE}" == "none" ]]; then
		driver_name=none
	elif [[ -n "${DRIVER_OVERRIDE}" ]]; then
		driver_path="$DRIVER_OVERRIDE"
		driver_name="${DRIVER_OVERRIDE##*/}"
		# modprobe and the sysfs don't use the .ko suffix.
		driver_name=${driver_name%.ko}
		# path = name -> there is no path
		if [[ "$driver_path" = "$driver_name" ]]; then
			driver_path=""
		fi
	elif is_iommu_enabled; then
		driver_name=vfio-pci
		# Just in case, attempt to load VFIO_IOMMU_TYPE1 module into the kernel - this
		# should be done automatically by modprobe since this particular module should
		# be a part of vfio-pci dependencies, however, on some distros, it seems that
		# it's not the case. See #1689.
		if modinfo vfio_iommu_type1 > /dev/null; then
			modprobe vfio_iommu_type1
		fi
	elif ! check_for_driver uio_pci_generic || modinfo uio_pci_generic > /dev/null 2>&1; then
		driver_name=uio_pci_generic
	elif [[ -e $igb_uio_fallback ]]; then
		driver_path="$igb_uio_fallback"
		driver_name="igb_uio"
		echo "WARNING: uio_pci_generic not detected - using $driver_name"
	else
		echo "No valid drivers found [vfio-pci, uio_pci_generic, igb_uio]. Please enable one of the kernel modules."
		return 1
	fi

	# modprobe assumes the directory of the module. If the user passes in a path, we should use insmod
	if [[ $driver_name != "none" ]]; then
		if [[ -n "$driver_path" ]]; then
			insmod $driver_path || true
		else
			modprobe $driver_name
		fi
	fi

	for bdf in "${!all_devices_d[@]}"; do
		if ((all_devices_d["$bdf"] == 0)); then
			if [[ -n ${nvme_d["$bdf"]} ]]; then
				# Some nvme controllers may take significant amount of time while being
				# unbound from the driver. Put that task into background to speed up the
				# whole process. Currently this is done only for the devices bound to the
				# nvme driver as other, i.e., ioatdma's, trigger a kernel BUG when being
				# unbound in parallel. See https://bugzilla.kernel.org/show_bug.cgi?id=209041.
				linux_bind_driver "$bdf" "$driver_name" &
			else
				linux_bind_driver "$bdf" "$driver_name"
			fi
		fi
	done
	# Barrier: reap all backgrounded nvme rebinds before continuing.
	wait

	# Ask the kernel to rescan so devices settle under their new drivers.
	echo "1" > "/sys/bus/pci/rescan"
}
437
438function cleanup_linux() {
439	local dirs_to_clean=() files_to_clean=() opened_files=() file_locks=()
440	local match_spdk="spdk_tgt|iscsi|vhost|nvmf|rocksdb|bdevio|bdevperf|vhost_fuzz|nvme_fuzz|accel_perf|bdev_svc"
441
442	dirs_to_clean=({/var/run,/tmp}/dpdk/spdk{,_pid}+([0-9]))
443	if [[ -d $XDG_RUNTIME_DIR ]]; then
444		dirs_to_clean+=("$XDG_RUNTIME_DIR/dpdk/spdk"{,_pid}+([0-9]))
445	fi
446
447	for dir in "${dirs_to_clean[@]}"; do
448		files_to_clean+=("$dir/"*)
449	done
450	file_locks+=(/var/tmp/spdk_pci_lock*)
451	file_locks+=(/var/tmp/spdk_cpu_lock*)
452
453	files_to_clean+=(/dev/shm/@(@($match_spdk)_trace|spdk_iscsi_conns)*)
454	files_to_clean+=("${file_locks[@]}")
455
456	# This may fail in case path that readlink attempts to resolve suddenly
457	# disappears (as it may happen with terminating processes).
458	opened_files+=($(readlink -f /proc/+([0-9])/fd/+([0-9]))) || true
459
460	if ((${#opened_files[@]} == 0)); then
461		echo "Can't get list of opened files!"
462		exit 1
463	fi
464
465	echo 'Cleaning'
466	for f in "${files_to_clean[@]}"; do
467		[[ -e $f ]] || continue
468		if [[ ${opened_files[*]} != *"$f"* ]]; then
469			echo "Removing:    $f"
470			rm $f
471		else
472			echo "Still open: $f"
473		fi
474	done
475
476	for dir in "${dirs_to_clean[@]}"; do
477		[[ -d $dir ]] || continue
478		if [[ ${opened_files[*]} != *"$dir"* ]]; then
479			echo "Removing:    $dir"
480			rmdir $dir
481		else
482			echo "Still open: $dir"
483		fi
484	done
485	echo "Clean"
486}
487
# Try to allocate $NRHUGE hugepages through the given kernel interface.
# $1 - path to the nr_hugepages file (procfs or per-node sysfs)
# $2 - optional NUMA node id, used only in messages
# Allocation is skipped when enough pages already exist, unless SHRINK_HUGE=yes.
# Returns 1 when fewer pages than requested could be allocated.
check_hugepages_alloc() {
	local hp_int=$1
	local allocated

	allocated=$(< "$hp_int")

	if [[ $SHRINK_HUGE != yes ]] && ((allocated >= NRHUGE)); then
		echo "INFO: Requested $NRHUGE hugepages but $allocated already allocated ${2:+on node$2}"
		return 0
	fi

	# Never write a negative count.
	echo $((NRHUGE < 0 ? 0 : NRHUGE)) > "$hp_int"

	allocated=$(< "$hp_int")
	if ((allocated < NRHUGE)); then
		cat <<- ERROR

			## ERROR: requested $NRHUGE hugepages but $allocated could be allocated ${2:+on node$2}.
			## Memory might be heavily fragmented. Please try flushing the system cache, or reboot the machine.
		ERROR
		return 1
	fi
}
511
512clear_hugepages() { echo 0 > /proc/sys/vm/nr_hugepages; }
513
# Allocate hugepages according to HUGENODE/NRHUGE/HUGEPGSZ (and CLEAR_HUGE).
# Without HUGENODE (or without NUMA support) the system-wide interface is
# used. Otherwise pages are allocated per NUMA node; HUGENODE accepts plain
# node ids and/or explicit "nodes_hp[N]=count" assignments, comma-separated.
configure_linux_hugepages() {
	local node
	local nodes_to_use nodes_hp
	# Fix: keep the node-id -> sysfs interface map local (it used to leak
	# into the global scope); also dropped the unused system_nodes local.
	local -a nodes=()

	if [[ $CLEAR_HUGE == yes ]]; then
		clear_hugepages
	fi

	if [[ -z $HUGENODE ]]; then
		check_hugepages_alloc /proc/sys/vm/nr_hugepages
		return 0
	fi

	for node in /sys/devices/system/node/node*; do
		[[ -e $node ]] || continue
		nodes[${node##*node}]=$node/hugepages/hugepages-${HUGEPGSZ}kB/nr_hugepages
	done

	if ((${#nodes[@]} == 0)); then
		# No NUMA support? Fallback to common interface
		check_hugepages_alloc /proc/sys/vm/nr_hugepages
		return 0
	fi

	IFS="," read -ra nodes_to_use <<< "$HUGENODE"
	if ((${#nodes_to_use[@]} == 0)); then
		nodes_to_use[0]=0
	fi

	# Align indexes with node ids
	for node in "${!nodes_to_use[@]}"; do
		if [[ ${nodes_to_use[node]} =~ ^nodes_hp\[[0-9]+\]= ]]; then
			eval "${nodes_to_use[node]}"
		elif [[ ${nodes_to_use[node]} =~ ^[0-9]+$ ]]; then
			nodes_hp[nodes_to_use[node]]=$NRHUGE
		fi
	done

	for node in "${!nodes_hp[@]}"; do
		if [[ -z ${nodes[node]} ]]; then
			echo "Node $node doesn't exist, ignoring" >&2
			continue
		fi
		# Per-node NRHUGE override is passed via the environment.
		NRHUGE=${nodes_hp[node]:-$NRHUGE} check_hugepages_alloc "${nodes[node]}" "$node"
	done
}
560
# Top-level Linux "config" mode: bind devices, ensure a hugetlbfs mount
# exists, allocate hugepages and fix up permissions/limits for TARGET_USER.
function configure_linux() {
	configure_linux_pci
	hugetlbfs_mounts=$(linux_hugetlbfs_mounts)

	# No hugetlbfs mounted anywhere - create the default one.
	if [ -z "$hugetlbfs_mounts" ]; then
		hugetlbfs_mounts=/mnt/huge
		echo "Mounting hugetlbfs at $hugetlbfs_mounts"
		mkdir -p "$hugetlbfs_mounts"
		mount -t hugetlbfs nodev "$hugetlbfs_mounts"
	fi

	configure_linux_hugepages

	# vfio requires the target user to be able to write the hugepage mounts
	# and to have a sufficient memlock limit for DMA mappings.
	if [ "$driver_name" = "vfio-pci" ]; then
		if [ -n "$TARGET_USER" ]; then
			for mount in $hugetlbfs_mounts; do
				chown "$TARGET_USER" "$mount"
				chmod g+w "$mount"
			done

			# Probe the user's memlock limit via a login shell.
			MEMLOCK_AMNT=$(su "$TARGET_USER" -c "ulimit -l")
			if [[ $MEMLOCK_AMNT != "unlimited" ]]; then
				MEMLOCK_MB=$((MEMLOCK_AMNT / 1024))
				cat <<- MEMLOCK
					"$TARGET_USER" user memlock limit: $MEMLOCK_MB MB

					This is the maximum amount of memory you will be
					able to use with DPDK and VFIO if run as user "$TARGET_USER".
					To change this, please adjust limits.conf memlock limit for user "$TARGET_USER".
				MEMLOCK
				if ((MEMLOCK_AMNT < 65536)); then
					echo ""
					echo "## WARNING: memlock limit is less than 64MB"
					echo -n "## DPDK with VFIO may not be able to initialize "
					echo "if run as user \"$TARGET_USER\"."
				fi
			fi
		fi
	fi

	if [ $(uname -i) == "x86_64" ] && [ ! -e /dev/cpu/0/msr ]; then
		# Some distros build msr as a module.  Make sure it's loaded to ensure
		#  DPDK can easily figure out the TSC rate rather than relying on 100ms
		#  sleeps.
		modprobe msr &> /dev/null || true
	fi
}
608
# Rebind every eligible device back to its natural kernel driver, or unbind
# it entirely when no suitable driver module is present.
function reset_linux_pci() {
	# virtio
	# TODO: check if virtio-pci is loaded first and just unbind if it is not loaded
	# Requires some more investigation - for example, some kernels do not seem to have
	#  virtio-pci but just virtio_scsi instead.  Also need to make sure we get the
	#  underscore vs. dash right in the virtio_scsi name.
	modprobe virtio-pci || true
	for bdf in "${!all_devices_d[@]}"; do
		((all_devices_d["$bdf"] == 0)) || continue

		driver=$(collect_driver "$bdf")
		# check_for_driver returns non-zero when the driver IS present, so
		# bind only when the module is actually available.
		if [[ -n $driver ]] && ! check_for_driver "$driver"; then
			linux_bind_driver "$bdf" "$driver"
		else
			linux_unbind_driver "$bdf"
		fi
	done

	# Trigger a rescan so devices settle under the restored drivers.
	echo "1" > "/sys/bus/pci/rescan"
}
629
# Linux "reset" mode: restore kernel drivers and remove leftover SPDK hugepage
# map files and runtime files - but only those no live process holds locked.
function reset_linux() {
	reset_linux_pci
	for mount in $(linux_hugetlbfs_mounts); do
		for hp in "$mount"/spdk*map_*; do
			# flock -n succeeds only when no process holds the lock.
			flock -n "$hp" true && rm -f "$hp"
		done
	done
	rm -f /run/.spdk*
}
639
# Print hugepage usage (per NUMA node when available, otherwise system-wide)
# followed by a table of all SPDK-compatible devices found on the system.
# Headers go to stderr so stdout stays machine-parseable.
function status_linux() {
	echo "Hugepages" >&2
	printf "%-6s %10s %8s / %6s\n" "node" "hugesize" "free" "total" >&2

	numa_nodes=0
	for path in /sys/devices/system/node/node*/hugepages/hugepages-*/; do
		numa_nodes=$((numa_nodes + 1))
		# Quoted expansions throughout (SC2086 fix) - no behavior change for
		# these sysfs paths, just proper hygiene.
		free_pages=$(cat "$path/free_hugepages")
		all_pages=$(cat "$path/nr_hugepages")

		[[ $path =~ (node[0-9]+)/hugepages/hugepages-([0-9]+kB) ]]

		node=${BASH_REMATCH[1]}
		huge_size=${BASH_REMATCH[2]}

		printf "%-6s %10s %8s / %6s\n" "$node" "$huge_size" "$free_pages" "$all_pages"
	done

	# fall back to system-wide hugepages
	if [ "$numa_nodes" = "0" ]; then
		free_pages=$(grep HugePages_Free /proc/meminfo | awk '{ print $2 }')
		all_pages=$(grep HugePages_Total /proc/meminfo | awk '{ print $2 }')
		node="-"
		huge_size="$HUGEPGSZ"

		printf "%-6s %10s %8s / %6s\n" "$node" "$huge_size" "$free_pages" "$all_pages"
	fi

	printf '\n%-25s %-15s %-6s %-6s %-7s %-16s %-10s %s\n' \
		"Type" "BDF" "Vendor" "Device" "NUMA" "Driver" "Device" "Block devices" >&2

	sorted_bdfs=($(printf '%s\n' "${!all_devices_d[@]}" | sort))

	for bdf in "${sorted_bdfs[@]}"; do
		driver=${pci_bus_driver["$bdf"]}
		if [ "$numa_nodes" = "0" ]; then
			node="-"
		else
			node=$(cat "/sys/bus/pci/devices/$bdf/numa_node")
			if ((node == -1)); then
				node=unknown
			fi
		fi
		# The nvme character device name lives as the sole entry under the
		# controller's nvme/ sysfs directory.
		if [ "$driver" = "nvme" ] && [ -d "/sys/bus/pci/devices/$bdf/nvme" ]; then
			name=$(ls "/sys/bus/pci/devices/$bdf/nvme")
		else
			name="-"
		fi

		if [[ -n ${nvme_d["$bdf"]} || -n ${virtio_d["$bdf"]} ]]; then
			blknames=($(get_block_dev_from_bdf "$bdf"))
		else
			blknames=("-")
		fi

		# First matching device-type map wins for the Type column.
		desc=""
		desc=${desc:-${nvme_d["$bdf"]:+NVMe${nvme_vmd_d["$bdf"]:+@${nvme_vmd_d["$bdf"]}(VMD)}}}
		desc=${desc:-${ioat_d["$bdf"]:+I/OAT}}
		desc=${desc:-${dsa_d["$bdf"]:+DSA}}
		desc=${desc:-${iaa_d["$bdf"]:+IAA}}
		desc=${desc:-${virtio_d["$bdf"]:+virtio}}
		desc=${desc:-${vmd_d["$bdf"]:+VMD}}

		printf '%-25s %-15s %-6s %-6s %-7s %-16s %-10s %s\n' \
			"$desc" "$bdf" "${pci_ids_vendor["$bdf"]#0x}" "${pci_ids_device["$bdf"]#0x}" \
			"$node" "${driver:--}" "${name:-}" "${blknames[*]:--}"
	done
}
708
# Print contigmem state and a table of SPDK-compatible devices (FreeBSD).
function status_freebsd() {
	local pci

	# Subshell helper: print one table row per BDF given after the type label,
	# sorted by BDF.
	status_print() (
		local type=$1
		local dev driver

		shift

		for pci; do
			printf '%-8s %-15s %-6s %-6s %-16s\n' \
				"$type" \
				"$pci" \
				"${pci_ids_vendor["$pci"]}" \
				"${pci_ids_device["$pci"]}" \
				"${pci_bus_driver["$pci"]}"
		done | sort -k2,2
	)

	local contigmem=present
	local contigmem_buffer_size
	local contigmem_num_buffers

	if ! kldstat -q -m contigmem; then
		contigmem="not present"
	fi
	# kenv fails when the tunable was never set - report "not set" then.
	if ! contigmem_buffer_size=$(kenv hw.contigmem.buffer_size 2> /dev/null); then
		contigmem_buffer_size="not set"
	fi
	if ! contigmem_num_buffers=$(kenv hw.contigmem.num_buffers 2> /dev/null); then
		contigmem_num_buffers="not set"
	fi

	cat <<- BSD_INFO
		Contigmem ($contigmem)
		Buffer Size: $contigmem_buffer_size
		Num Buffers: $contigmem_num_buffers

	BSD_INFO

	# Header goes to stderr so stdout stays machine-parseable.
	printf '\n%-8s %-15s %-6s %-6s %-16s\n' \
		"Type" "BDF" "Vendor" "Device" "Driver" >&2

	status_print "NVMe" "${!nvme_d[@]}"
	status_print "I/OAT" "${!ioat_d[@]}"
	status_print "DSA" "${!dsa_d[@]}"
	status_print "IAA" "${!iaa_d[@]}"
	status_print "VMD" "${!vmd_d[@]}"
}
758
# Point nic_uio at the given BDFs (plus any unsupported leftovers already in
# the kernel env) and (re)load the module.
function configure_freebsd_pci() {
	local BDFS

	BDFS+=("$@")

	# Refuse to clobber an existing nic_uio setup that contains devices this
	# script does not know about, unless the user forces the rebind.
	if ((${#unsupported_nic_uio_hw[@]} > 0)) && [[ $FORCE_NIC_UIO_REBIND != yes ]]; then
		warn_unsupported_nic_uio_hw
		return 1
	fi

	BDFS+=("${unsupported_nic_uio_hw[@]}")

	if kldstat -n nic_uio &> /dev/null; then
		kldunload nic_uio.ko
	fi

	# nic_uio reads the comma-separated device list from the kernel env.
	local IFS=","
	kenv hw.nic_uio.bdfs="${BDFS[*]}"
	kldload nic_uio.ko
}
779
# Collect BDFs already registered with nic_uio (hw.nic_uio.bdfs) that this
# script does not consider supported into the global unsupported_nic_uio_hw
# array. Always returns 0, also when the kenv variable is absent.
# Fixes: dropped the unused local 'all_devices'; grep -F so '.' in a BDF is
# matched literally instead of as a regex wildcard.
function get_unsupported_nic_uio_hw() {
	local bdfs bdf
	local -g unsupported_nic_uio_hw

	IFS="," read -ra bdfs < <(kenv hw.nic_uio.bdfs 2> /dev/null) || return 0

	for bdf in "${bdfs[@]}"; do
		grep -qF "$bdf" <(printf '%s\n' "${!all_devices_d[@]}") || unsupported_nic_uio_hw+=("$bdf")
	done

	return 0
}
792
# Print a warning listing the nic_uio devices this script does not manage
# (collected earlier into the global unsupported_nic_uio_hw array).
function warn_unsupported_nic_uio_hw() {
	cat <<- NIC_UIO

		WARNING: Unsupported devices detected in the nic_uio setup:

		$(printf '  %s\n' "${unsupported_nic_uio_hw[@]}")

		Remove them first or pass FORCE_NIC_UIO_REBIND=yes through the environment.

	NIC_UIO
}
804
# FreeBSD "config" mode: hand all supported device classes over to nic_uio.
function configure_freebsd() {
	_configure_freebsd "${!nvme_d[@]}" "${!ioat_d[@]}" "${!dsa_d[@]}" "${!iaa_d[@]}" "${!vmd_d[@]}"
}
808
# Shared FreeBSD config path: verify the DPDK modules exist, set up nic_uio
# for the given BDFs and (re)load contigmem sized according to HUGEMEM.
function _configure_freebsd() {
	if ! check_for_driver_freebsd; then
		echo "DPDK drivers (contigmem and/or nic_uio) are missing, aborting" >&2
		return 1
	fi
	configure_freebsd_pci "$@"
	# If contigmem is already loaded but the HUGEMEM specified doesn't match the
	#  previous value, unload contigmem so that we can reload with the new value.
	if kldstat -q -m contigmem; then
		# contigmem may be loaded, but the kernel environment doesn't have to
		# be necessarily set at this point. If it isn't, kenv will fail to
		# pick up the hw. options. Handle it.
		if ! contigmem_num_buffers=$(kenv hw.contigmem.num_buffers); then
			contigmem_num_buffers=-1
		fi 2> /dev/null
		if ((contigmem_num_buffers != HUGEMEM / 256)); then
			kldunload contigmem.ko
		fi
	fi
	if ! kldstat -q -m contigmem; then
		# Each contigmem buffer is fixed at 256MB; scale the count to HUGEMEM.
		kenv hw.contigmem.num_buffers=$((HUGEMEM / 256))
		kenv hw.contigmem.buffer_size=$((256 * 1024 * 1024))
		kldload contigmem.ko
	fi
}
834
# FreeBSD "reset" mode: unload contigmem and nic_uio. When unsupported
# devices exist in the nic_uio env, either refuse (without the force flag) or
# recreate the nic_uio setup afterwards with just those devices.
function reset_freebsd() {
	# Don't reap the entire nic_uio setup in case there are unsupported devices in the kernel env
	if ((${#unsupported_nic_uio_hw[@]} > 0)) && [[ $FORCE_NIC_UIO_REBIND != yes ]]; then
		warn_unsupported_nic_uio_hw
		return 1
	fi

	kldunload contigmem.ko || true
	kldunload nic_uio.ko || true

	if ((${#unsupported_nic_uio_hw[@]} > 0)); then
		# HACK: try to be nice and recreate the setup but only with the unsupported devices
		_unsupported_nic_uio_hw=("${unsupported_nic_uio_hw[@]}") unsupported_nic_uio_hw=()
		_configure_freebsd "${_unsupported_nic_uio_hw[@]}"
	fi
}
851
# Validate a user-provided HUGEPGSZ against kernel support (falling back to
# the kernel default when unsupported), then derive HUGEPGSZ_MB and the
# NRHUGE page count needed to cover HUGEMEM megabytes.
function set_hp() {
	if [[ -n $HUGEPGSZ && ! -e /sys/kernel/mm/hugepages/hugepages-${HUGEPGSZ}kB ]]; then
		echo "${HUGEPGSZ}kB is not supported by the running kernel, ignoring" >&2
		unset -v HUGEPGSZ
	fi

	# Default hugepage size in kB, taken from /proc/meminfo.
	HUGEPGSZ=${HUGEPGSZ:-$(awk '$1 == "Hugepagesize:" {print $2}' /proc/meminfo)}
	HUGEPGSZ_MB=$((HUGEPGSZ / 1024))
	# Round HUGEMEM up to a whole number of pages.
	NRHUGE=${NRHUGE:-$(((HUGEMEM + HUGEPGSZ_MB - 1) / HUGEPGSZ_MB))}
}
862
kmsg "spdk: $0 $* (start)"

# Snapshot the current PCI bus state (drivers, ids, iommu groups) via the
# common.sh helper. NOTE(review): CMD=reset presumably selects the caching
# strategy inside cache_pci_bus - confirm against common.sh.
CMD=reset cache_pci_bus

mode=$1

# Default mode is "config".
if [ -z "$mode" ]; then
	mode="config"
fi

# Environment defaults.
: ${HUGEMEM:=2048}
: ${PCI_ALLOWED:=""}
: ${PCI_BLOCKED:=""}

# Legacy interface: NVME_ALLOWED entries are folded into PCI_ALLOWED.
if [ -n "$NVME_ALLOWED" ]; then
	PCI_ALLOWED="$PCI_ALLOWED $NVME_ALLOWED"
fi

# SKIP_PCI blocks all devices outright.
if [ -n "$SKIP_PCI" ]; then
	PCI_ALLOWED="none"
fi

# Pick the user owning hugepage mounts/vfio groups: explicit TARGET_USER,
# then the sudo caller, then the login name (best effort).
if [ -z "$TARGET_USER" ]; then
	TARGET_USER="$SUDO_USER"
	if [ -z "$TARGET_USER" ]; then
		TARGET_USER=$(logname 2> /dev/null) || true
	fi
fi

collect_devices "$mode"

if [[ $os == Linux ]]; then
	set_hp
fi

if [[ $mode == interactive ]]; then
	source "$rootdir/scripts/common/setup/interactive.sh"
	main_menu "$2" || exit 0
fi

if [[ $mode == reset && $PCI_BLOCK_SYNC_ON_RESET == yes ]]; then
	# Note that this will wait only for the first block device attached to
	# a given storage controller. For nvme this may miss some of the devs
	# in case multiple namespaces are being in place.
	# FIXME: Wait for nvme controller(s) to be in live state and determine
	# number of configured namespaces, build list of potential block devs
	# and pass them to sync_dev_uevents. Is it worth the effort?
	bdfs_to_wait_for=()
	for bdf in "${!all_devices_d[@]}"; do
		((all_devices_d["$bdf"] == 0)) || continue
		if [[ -n ${nvme_d["$bdf"]} || -n ${virtio_d["$bdf"]} ]]; then
			# Only wait for devices whose driver will actually change.
			[[ $(collect_driver "$bdf") != "${pci_bus_driver["$bdf"]}" ]] || continue
			bdfs_to_wait_for+=("$bdf")
		fi
	done
	if ((${#bdfs_to_wait_for[@]} > 0)); then
		echo "Waiting for block devices as requested"
		export UEVENT_TIMEOUT=5 DEVPATH_LOOKUP=yes DEVPATH_SUBSYSTEM=pci
		"$rootdir/scripts/sync_dev_uevents.sh" \
			block/disk \
			"${bdfs_to_wait_for[@]}" &
		sync_pid=$!
	fi
fi

# Dispatch the requested mode per platform.
if [[ $os == Linux ]]; then
	if [ "$mode" == "config" ]; then
		configure_linux
	elif [ "$mode" == "cleanup" ]; then
		cleanup_linux
		clear_hugepages
	elif [ "$mode" == "reset" ]; then
		reset_linux
	elif [ "$mode" == "status" ]; then
		status_linux
	elif [ "$mode" == "help" ]; then
		usage $0
	else
		usage $0 "Invalid argument '$mode'"
	fi
else
	if [ "$mode" == "config" ]; then
		configure_freebsd
	elif [ "$mode" == "reset" ]; then
		reset_freebsd
	elif [ "$mode" == "cleanup" ]; then
		echo "setup.sh cleanup function not yet supported on $os"
	elif [ "$mode" == "status" ]; then
		status_freebsd
	elif [ "$mode" == "help" ]; then
		usage $0
	else
		usage $0 "Invalid argument '$mode'"
	fi
fi

# Reap the background uevent-sync helper, if one was started above.
if [[ -e /proc/$sync_pid/status ]]; then
	wait "$sync_pid"
fi

kmsg "spdk: $0 $* (done)"
964