#!/usr/bin/env bash
#  SPDX-License-Identifier: BSD-3-Clause
#  Copyright (C) 2016 Intel Corporation
#  All rights reserved.
#
set -e
shopt -s nullglob extglob

os=$(uname -s)

if [[ $os != Linux && $os != FreeBSD ]]; then
	echo "Unsupported platform ($os), aborting"
	exit 1
fi

rootdir=$(readlink -f "$(dirname "$0")")/..
source "$rootdir/scripts/common.sh"

function usage() {
	if [[ $os == Linux ]]; then
		options="[config|reset|status|cleanup|interactive|help]"
	else
		options="[config|reset|interactive|help]"
	fi

	[[ -n $2 ]] && (
		echo "$2"
		echo ""
	)
	echo "Helper script for allocating hugepages and binding NVMe, I/OAT, VMD and Virtio devices"
	echo "to a generic VFIO kernel driver. If VFIO is not available on the system, this script"
	echo "will fall back to UIO. NVMe and Virtio devices with active mountpoints will be ignored."
	echo "All hugepage operations use the system's default hugepage size (hugepagesz)."
	echo "Usage: $(basename $1) $options"
	echo
	echo "$options - as follows:"
	echo "config            Default mode. Allocate hugepages and bind PCI devices."
	if [[ $os == Linux ]]; then
		echo "cleanup           Remove any orphaned files that may be left in the system after an SPDK application exits"
	fi
	echo "reset             Rebind PCI devices back to their original drivers."
	echo "                  Also clean up any leftover SPDK files/resources."
	echo "                  Hugepage memory size will remain unchanged."
	if [[ $os == Linux ]]; then
		echo "status            Print status of all SPDK-compatible devices on the system."
	fi
	echo "interactive       Execute the script in interactive mode."
	echo "help              Print this help message."
	echo
	echo "The following environment variables can be specified."
	echo "HUGEMEM           Size of hugepage memory to allocate (in MB). 2048 by default."
	echo "                  For NUMA systems, the hugepages will be distributed on node0 by"
	echo "                  default."
	echo "HUGE_EVEN_ALLOC   If set to 'yes', hugepages will be evenly distributed across all"
	echo "                  of the system's NUMA nodes (effectively ignoring anything set in HUGENODE)."
	echo "                  Uses the kernel's default hugepage size."
	echo "NRHUGE            Number of hugepages to allocate. This variable overrides HUGEMEM."
	echo "HUGENODE          Specific NUMA node to allocate hugepages on. Multiple nodes can be"
	echo "                  separated with commas. By default, NRHUGE will be applied on each node."
	echo "                  Hugepages can be defined per node with e.g.:"
	echo "                  HUGENODE='nodes_hp[0]=2048,nodes_hp[1]=512,2' - this will allocate"
	echo "                  2048 pages for node0, 512 for node1 and default NRHUGE for node2."
	echo "HUGEPGSZ          Size of the hugepages to use in kB. If not set, kernel's default"
	echo "                  setting is used."
	echo "SHRINK_HUGE       If set to 'yes', hugepages allocation won't be skipped in case the"
	echo "                  number of requested hugepages is lower than what's already"
	echo "                  allocated. Doesn't apply when HUGE_EVEN_ALLOC is in use."
	echo "CLEAR_HUGE        If set to 'yes', an attempt to remove hugepages from all nodes will"
	echo "                  be made prior to allocation."
	echo "PCI_ALLOWED"
	echo "PCI_BLOCKED       Whitespace separated list of PCI devices (NVMe, I/OAT, VMD, Virtio)."
	echo "                  Each device must be specified as a full PCI address."
	echo "                  E.g. PCI_ALLOWED=\"0000:01:00.0 0000:02:00.0\""
	echo "                  To block all PCI devices: PCI_ALLOWED=\"none\""
	echo "                  To allow all PCI devices except 0000:01:00.0: PCI_BLOCKED=\"0000:01:00.0\""
	echo "                  To allow only PCI device 0000:01:00.0: PCI_ALLOWED=\"0000:01:00.0\""
	echo "                  If PCI_ALLOWED and PCI_BLOCKED are empty or unset, all PCI devices"
	echo "                  will be bound."
	echo "                  Each device in PCI_BLOCKED will be ignored (driver won't be changed)."
	echo "                  PCI_BLOCKED has precedence over PCI_ALLOWED."
	echo "TARGET_USER       User that will own hugepage mountpoint directory and vfio groups."
	echo "                  By default the current user will be used."
	echo "DRIVER_OVERRIDE   Disable automatic vfio-pci/uio_pci_generic selection and forcefully"
	echo "                  bind devices to the given driver."
	echo "                  E.g. DRIVER_OVERRIDE=uio_pci_generic or DRIVER_OVERRIDE=/home/public/dpdk/build/kmod/igb_uio.ko"
	echo "PCI_BLOCK_SYNC_ON_RESET"
	echo "                  If set in the environment, an attempt to wait for block devices associated"
	echo "                  with the given PCI device will be made upon reset."
	echo "UNBIND_ENTIRE_IOMMU_GROUP"
	echo "                  If set, all devices from the nvme device's IOMMU group will be unbound from their drivers."
	echo "                  Use with caution."
	echo "DEV_TYPE"
	echo "                  Perform the action only against the selected type of devices. Supported:"
	echo "                    IOAT|DSA|IAA|VIRTIO|VMD|NVME."
	echo "                  Default is to select all types."
	exit 0
}
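
# Example invocations (illustrative; sizes and node ids are placeholders):
#   sudo ./setup.sh                          # config: allocate hugepages, bind devices
#   sudo HUGEMEM=4096 ./setup.sh             # 4 GB worth of hugepages
#   sudo NRHUGE=512 HUGENODE=0,1 ./setup.sh  # 512 hugepages on node0 and node1
#   sudo DRIVER_OVERRIDE=none ./setup.sh     # unbind devices without rebinding
#   sudo ./setup.sh reset                    # rebind devices to their original drivers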

# In monolithic kernels lsmod won't work, so fall back to /sys/module. We also
# check /sys/bus/pci/drivers/ as neither lsmod nor /sys/module may contain the
# needed info (like in Fedora-like OSes).
function check_for_driver() {
	if [[ -z $1 ]]; then
		return 0
	fi

	if lsmod | grep -q ${1//-/_}; then
		return 1
	fi

	if [[ -d /sys/module/${1} ||
		-d /sys/module/${1//-/_} ||
		-d /sys/bus/pci/drivers/${1} ||
		-d /sys/bus/pci/drivers/${1//-/_} ]]; then
		return 2
	fi
	return 0
}
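
# check_for_driver returns 0 when the driver is absent and 1 or 2 when it is
# already known to the kernel. An illustrative standalone use (the main flow
# below calls it differently):
#   check_for_driver uio_pci_generic && modprobe uio_pci_generic  # load only if absent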

function check_for_driver_freebsd() {
	# Check if dpdk drivers (nic_uio, contigmem) are in the kernel's module path.
	local search_paths path driver
	IFS=";" read -ra search_paths < <(kldconfig -rU)

	for driver in contigmem.ko nic_uio.ko; do
		for path in "${search_paths[@]}"; do
			[[ -f $path/$driver ]] && continue 2
		done
		return 1
	done
	return 0
}

function pci_dev_echo() {
	local bdf="$1"
	shift
	echo "$bdf (${pci_ids_vendor["$bdf"]#0x} ${pci_ids_device["$bdf"]#0x}): $*"
}

function probe_driver() {
	local bdf=$1
	local driver_name=$2
	local old_driver_name=${pci_bus_driver["$bdf"]:-no driver}

	if [[ $driver_name == "$old_driver_name" ]]; then
		pci_dev_echo "$bdf" "Already using the $old_driver_name driver"
		return 0
	fi

	if [[ $old_driver_name != "no driver" ]]; then
		echo "$bdf" > "/sys/bus/pci/devices/$bdf/driver/unbind"
	fi

	pci_dev_echo "$bdf" "$old_driver_name -> $driver_name"

	if [[ $driver_name == "none" ]]; then
		return 0
	fi

	local probe_attempts=0
	echo "$driver_name" > "/sys/bus/pci/devices/$bdf/driver_override"
	while ! echo "$bdf" > "/sys/bus/pci/drivers_probe" && ((probe_attempts++ < 10)); do
		pci_dev_echo "$bdf" "failed to bind to $driver_name, retrying ($probe_attempts)"
		sleep 0.5
	done 2> /dev/null

	echo "" > "/sys/bus/pci/devices/$bdf/driver_override"

	if [[ ! -e /sys/bus/pci/drivers/$driver_name/$bdf ]]; then
		pci_dev_echo "$bdf" "failed to bind to $driver_name, aborting"
		return 1
	fi
}
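
# The driver_override + drivers_probe writes above are the kernel's generic
# sysfs binding interface; done by hand for a single device it would look
# like this (illustrative, 0000:01:00.0 is a placeholder address):
#   echo vfio-pci > /sys/bus/pci/devices/0000:01:00.0/driver_override
#   echo 0000:01:00.0 > /sys/bus/pci/drivers_probe
#   echo "" > /sys/bus/pci/devices/0000:01:00.0/driver_override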

function linux_bind_driver() {
	local bdf="$1"
	local driver_name="$2"

	probe_driver "$bdf" "$driver_name"

	local iommu_group=${pci_iommu_groups["$bdf"]}
	if [ -e "/dev/vfio/$iommu_group" ]; then
		if [ -n "$TARGET_USER" ]; then
			chown "$TARGET_USER" "/dev/vfio/$iommu_group"
		fi
	fi

	local iommug=("${!iommu_groups[iommu_group]}")
	local _bdf _driver
	if ((${#iommug[@]} > 1)) && [[ $driver_name == vfio* ]]; then
		pci_dev_echo "$bdf" "WARNING: detected multiple devices (${#iommug[@]}) under the same IOMMU group!"
		for _bdf in "${iommug[@]}"; do
			[[ $_bdf == "$bdf" ]] && continue
			_driver=$(readlink -f "/sys/bus/pci/devices/$_bdf/driver") && _driver=${_driver##*/}
			if [[ $_driver == "$driver_name" ]]; then
				continue
			fi
			# See what DPDK considers to be a "viable" iommu group: dpdk/lib/eal/linux/eal_vfio.c -> rte_vfio_setup_device()
			pci_dev_echo "$bdf" "WARNING: ${_bdf##*/} not bound to $driver_name (${_driver:-no driver})"
			pci_dev_echo "$bdf" "WARNING: All devices in the IOMMU group must be bound to the same driver or unbound"
			if [[ $UNBIND_ENTIRE_IOMMU_GROUP == yes ]]; then
				pci_dev_echo "$bdf" "WARNING: Attempting to unbind ${_bdf##*/}"
				pci_bus_driver["${_bdf##*/}"]=$_driver
				probe_driver "${_bdf##*/}" none
			fi
		done
	fi
}

function linux_unbind_driver() {
	local bdf="$1"
	local old_driver_name=${pci_bus_driver["$bdf"]:-no driver}

	if [[ $old_driver_name == "no driver" ]]; then
		pci_dev_echo "$bdf" "Not bound to any driver"
		return 0
	fi

	if [[ -e /sys/bus/pci/drivers/$old_driver_name ]]; then
		echo "$bdf" > "/sys/bus/pci/drivers/$old_driver_name/unbind"
		echo "" > "/sys/bus/pci/devices/$bdf/driver_override"
	fi

	pci_dev_echo "$bdf" "$old_driver_name -> no driver"
}

function linux_hugetlbfs_mounts() {
	mount | grep ' type hugetlbfs ' | awk '{ print $3 }'
}

function get_block_dev_from_bdf() {
	local bdf=$1
	local block blocks=() ctrl sub

	for block in /sys/block/!(nvme*); do
		if [[ $(readlink -f "$block/device") == *"/$bdf/"* ]]; then
			blocks+=("${block##*/}")
		fi
	done

	blocks+=($(get_block_dev_from_nvme "$bdf"))

	printf '%s\n' "${blocks[@]}"
}

function get_block_dev_from_nvme() {
	local bdf=$1 block ctrl sub

	for ctrl in /sys/class/nvme/nvme*; do
		[[ -e $ctrl/address && $(< "$ctrl/address") == "$bdf" ]] || continue
		sub=$(< "$ctrl/subsysnqn") && break
	done

	[[ -n $sub ]] || return 0

	for block in /sys/block/nvme*; do
		[[ -e $block/hidden && $(< "$block/hidden") == 1 ]] && continue
		[[ $(< "$block/device/subsysnqn") == "$sub" ]] && echo "${block##*/}"
	done
}

function get_used_bdf_block_devs() {
	local bdf=$1
	local blocks block blockp dev mount holder
	local used

	hash lsblk &> /dev/null || return 1
	blocks=($(get_block_dev_from_bdf "$bdf"))

	for block in "${blocks[@]}"; do
		# Check if the device is held by some other device, regardless of
		# whether it's mounted or not.
		for holder in "/sys/class/block/$block"*/holders/*; do
			[[ -e $holder ]] || continue
			blockp=${holder%/holders*} blockp=${blockp##*/}
			if [[ -e $holder/slaves/$blockp ]]; then
				used+=("holder@$blockp:${holder##*/}")
			fi
		done
		while read -r dev mount; do
			if [[ -e $mount ]]; then
				used+=("mount@$block:$dev")
			fi
		done < <(lsblk -l -n -o NAME,MOUNTPOINT "/dev/$block")
		if ((${#used[@]} == 0)); then
			# Make sure we check if there's any valid data present on the target device,
			# regardless of whether it's being actively used or not. This is mainly done
			# to make sure we don't miss more complex setups like ZFS pools, etc.
			if block_in_use "$block" > /dev/null; then
				used+=("data@$block")
			fi
		fi
	done

	if ((${#used[@]} > 0)); then
		printf '%s\n' "${used[@]}"
	fi
}
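
# Illustrative tokens the function may print (device names are examples): a
# mounted partition yields "mount@nvme0n1:nvme0n1p1", a device-mapper holder
# yields "holder@sda1:dm-0" and a bare on-disk signature (e.g. a ZFS label)
# yields "data@sdb".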

function collect_devices() {
	local mode=$1 in_use

	map_supported_devices "$DEV_TYPE"

	for bdf in "${!all_devices_d[@]}"; do
		in_use=0
		if [[ $mode != status ]]; then
			if ! pci_can_use "$bdf"; then
				pci_dev_echo "$bdf" "Skipping denied controller at $bdf"
				in_use=1
			fi
		fi
		if [[ -n ${nvme_d["$bdf"]} || -n ${virtio_d["$bdf"]} ]]; then
			if ! verify_bdf_block_devs "$bdf"; then
				in_use=1
			fi
		fi
		if [[ -n ${vmd_d["$bdf"]} ]]; then
			if [[ $PCI_ALLOWED != *"$bdf"* ]]; then
				pci_dev_echo "$bdf" "Skipping not allowed VMD controller at $bdf"
				in_use=1
			elif ((vmd_nvme_count["$bdf"] > 0)) && [[ $DRIVER_OVERRIDE != none && $mode == config ]]; then
				cat <<- MESSAGE
					Binding new driver to VMD device with NVMe SSDs attached to the kernel:
					  ${!vmd_nvme_d["$bdf"]}
					The binding process may go faster if you first run this script with
					DRIVER_OVERRIDE="none" to unbind only the NVMe SSDs, and then run
					again to unbind the VMD devices.
				MESSAGE
			fi
		fi
		if [[ -n ${dsa_d["$bdf"]} ]] && [[ $PCI_ALLOWED != *"$bdf"* ]]; then
			pci_dev_echo "$bdf" "Skipping not allowed DSA controller at $bdf"
			in_use=1
		fi
		if [[ -n ${iaa_d["$bdf"]} ]] && [[ $PCI_ALLOWED != *"$bdf"* ]]; then
			pci_dev_echo "$bdf" "Skipping not allowed IAA controller at $bdf"
			in_use=1
		fi
		# Update in-use for each bdf. Default from the map_supported_devices() is 0 == "not used"
		local -n type_ref=${all_devices_type_d["$bdf"]}_d
		type_ref["$bdf"]=$in_use
		all_devices_d["$bdf"]=$in_use
	done

	# Check if we got any nvmes attached to VMDs sharing the same iommu_group - if there are
	# any, skip them since they won't be usable by SPDK without moving the entire VMD ctrl
	# away from the kernel first. That said, allow touching the nvmes in case the user requested
	# all devices to be unbound from any driver or if a dedicated override flag was set.
	[[ -z $ALLOW_NVME_BEHIND_VMD && $DRIVER_OVERRIDE != none ]] || return 0

	for bdf in "${!nvme_d[@]}"; do
		is_nvme_iommu_shared_with_vmd "$bdf" || continue
		nvme_d["$bdf"]=1 all_devices_d["$bdf"]=1
		pci_dev_echo "$bdf" "Skipping nvme behind VMD (${nvme_vmd_d["$bdf"]})"
	done

	return 0
}

function collect_driver() {
	local bdf=$1
	local drivers driver

	if [[ -e /sys/bus/pci/devices/$bdf/modalias ]] \
		&& drivers=($(modprobe -R "$(< "/sys/bus/pci/devices/$bdf/modalias")")); then
		# Pick first entry in case multiple aliases are bound to a driver.
		driver=$(readlink -f "/sys/module/${drivers[0]}/drivers/pci:"*)
		driver=${driver##*/}
	else
		[[ -n ${nvme_d["$bdf"]} ]] && driver=nvme
		[[ -n ${ioat_d["$bdf"]} ]] && driver=ioatdma
		[[ -n ${dsa_d["$bdf"]} ]] && driver=idxd
		[[ -n ${iaa_d["$bdf"]} ]] && driver=idxd
		[[ -n ${virtio_d["$bdf"]} ]] && driver=virtio-pci
		[[ -n ${vmd_d["$bdf"]} ]] && driver=vmd
	fi 2> /dev/null
	echo "$driver"
}

function verify_bdf_block_devs() {
	local bdf=$1
	local blknames
	blknames=($(get_used_bdf_block_devs "$bdf")) || return 1

	if ((${#blknames[@]} > 0)); then
		local IFS=","
		pci_dev_echo "$bdf" "Active devices: ${blknames[*]}, so not binding PCI dev"
		return 1
	fi
}

function configure_linux_pci() {
	local driver_path=""
	driver_name=""
	igb_uio_fallback=""

	if [[ -r "$rootdir/dpdk/build-tmp/kernel/linux/igb_uio/igb_uio.ko" ]]; then
		# igb_uio is a common driver to override with and it depends on uio.
		modprobe uio || true
		if ! check_for_driver igb_uio || insmod "$rootdir/dpdk/build-tmp/kernel/linux/igb_uio/igb_uio.ko"; then
			igb_uio_fallback="$rootdir/dpdk/build-tmp/kernel/linux/igb_uio/igb_uio.ko"
		fi
	fi

	if [[ "${DRIVER_OVERRIDE}" == "none" ]]; then
		driver_name=none
	elif [[ -n "${DRIVER_OVERRIDE}" ]]; then
		driver_path="$DRIVER_OVERRIDE"
		driver_name="${DRIVER_OVERRIDE##*/}"
		# modprobe and the sysfs don't use the .ko suffix.
		driver_name=${driver_name%.ko}
		# path = name -> there is no path
		if [[ "$driver_path" = "$driver_name" ]]; then
			driver_path=""
		fi
	elif is_iommu_enabled; then
		driver_name=vfio-pci
		# Just in case, attempt to load VFIO_IOMMU_TYPE1 module into the kernel - this
		# should be done automatically by modprobe since this particular module should
		# be a part of vfio-pci dependencies, however, on some distros, it seems that
		# it's not the case. See #1689.
		if modinfo vfio_iommu_type1 > /dev/null; then
			modprobe vfio_iommu_type1
		fi
	elif ! check_for_driver uio_pci_generic || modinfo uio_pci_generic > /dev/null 2>&1; then
		driver_name=uio_pci_generic
	elif [[ -e $igb_uio_fallback ]]; then
		driver_path="$igb_uio_fallback"
		driver_name="igb_uio"
		echo "WARNING: uio_pci_generic not detected - using $driver_name"
	else
		echo "No valid drivers found [vfio-pci, uio_pci_generic, igb_uio]. Please enable one of the kernel modules."
		return 1
	fi
	# modprobe locates the module on its own. If the user passed in a path, use insmod instead.
	if [[ $driver_name != "none" ]]; then
		if [[ -n "$driver_path" ]]; then
			insmod $driver_path || true
		else
			modprobe $driver_name
		fi
	fi

	for bdf in "${!all_devices_d[@]}"; do
		if ((all_devices_d["$bdf"] == 0)); then
			if [[ -n ${nvme_d["$bdf"]} ]]; then
				# Some nvme controllers may take a significant amount of time while being
				# unbound from the driver. Put that task into the background to speed up the
				# whole process. Currently this is done only for the devices bound to the
				# nvme driver as others, e.g. ioatdma, trigger a kernel BUG when being
				# unbound in parallel. See https://bugzilla.kernel.org/show_bug.cgi?id=209041.
				linux_bind_driver "$bdf" "$driver_name" &
			else
				linux_bind_driver "$bdf" "$driver_name"
			fi
		fi
	done
	wait

	echo "1" > "/sys/bus/pci/rescan"
}

function cleanup_linux() {
	local dirs_to_clean=() files_to_clean=() opened_files=() file_locks=()
	local match_spdk="spdk_tgt|iscsi|vhost|nvmf|rocksdb|bdevio|bdevperf|vhost_fuzz|nvme_fuzz|accel_perf|bdev_svc"

	dirs_to_clean=({/var/run,/tmp}/dpdk/spdk{,_pid}+([0-9]))
	if [[ -d $XDG_RUNTIME_DIR ]]; then
		dirs_to_clean+=("$XDG_RUNTIME_DIR/dpdk/spdk"{,_pid}+([0-9]))
	fi

	for dir in "${dirs_to_clean[@]}"; do
		files_to_clean+=("$dir/"*)
	done
	file_locks+=(/var/tmp/spdk_pci_lock*)
	file_locks+=(/var/tmp/spdk_cpu_lock*)

	files_to_clean+=(/dev/shm/@(@($match_spdk)_trace|spdk_iscsi_conns)*)
	files_to_clean+=("${file_locks[@]}")

	# This may fail in case the path that readlink attempts to resolve suddenly
	# disappears (as may happen with terminating processes).
	opened_files+=($(readlink -f /proc/+([0-9])/fd/+([0-9]))) || true

	if ((${#opened_files[@]} == 0)); then
		echo "Can't get list of opened files!"
		exit 1
	fi

	echo 'Cleaning'
	for f in "${files_to_clean[@]}"; do
		[[ -e $f ]] || continue
		if [[ ${opened_files[*]} != *"$f"* ]]; then
			echo "Removing:    $f"
			rm "$f"
		else
			echo "Still open: $f"
		fi
	done

	for dir in "${dirs_to_clean[@]}"; do
		[[ -d $dir ]] || continue
		if [[ ${opened_files[*]} != *"$dir"* ]]; then
			echo "Removing:    $dir"
			rmdir "$dir"
		else
			echo "Still open: $dir"
		fi
	done
	echo "Clean"
}

check_hugepages_alloc() {
	local hp_int=$1
	local allocated_hugepages

	allocated_hugepages=$(< "$hp_int")

	if ((NRHUGE <= allocated_hugepages)) && [[ $SHRINK_HUGE != yes ]]; then
		echo "INFO: Requested $NRHUGE hugepages but $allocated_hugepages already allocated ${2:+on node$2}"
		return 0
	fi

	echo $((NRHUGE < 0 ? 0 : NRHUGE)) > "$hp_int"

	allocated_hugepages=$(< "$hp_int")
	if ((allocated_hugepages < NRHUGE)); then
		cat <<- ERROR

			## ERROR: requested $NRHUGE hugepages but only $allocated_hugepages could be allocated ${2:+on node$2}.
			## Memory might be heavily fragmented. Please try flushing the system cache, or reboot the machine.
		ERROR
		return 1
	fi
}

clear_hugepages() { echo 0 > /proc/sys/vm/nr_hugepages; }

configure_linux_hugepages() {
	local node nodes=()
	local nodes_to_use nodes_hp

	if [[ $CLEAR_HUGE == yes ]]; then
		clear_hugepages
	fi

	if [[ $HUGE_EVEN_ALLOC == yes ]]; then
		clear_hugepages
		check_hugepages_alloc /proc/sys/vm/nr_hugepages
		return 0
	fi

	for node in /sys/devices/system/node/node*; do
		[[ -e $node ]] || continue
		nodes[${node##*node}]=$node/hugepages/hugepages-${HUGEPGSZ}kB/nr_hugepages
	done

	if ((${#nodes[@]} == 0)); then
		# No NUMA support? Fall back to the common interface.
		check_hugepages_alloc /proc/sys/vm/nr_hugepages
		return 0
	fi

	IFS="," read -ra nodes_to_use <<< "$HUGENODE"
	if ((${#nodes_to_use[@]} == 0)); then
		nodes_to_use[0]=0
	fi

	# Align indexes with node ids
	for node in "${!nodes_to_use[@]}"; do
		if [[ ${nodes_to_use[node]} =~ ^nodes_hp\[[0-9]+\]= ]]; then
			eval "${nodes_to_use[node]}"
		elif [[ ${nodes_to_use[node]} =~ ^[0-9]+$ ]]; then
			nodes_hp[nodes_to_use[node]]=$NRHUGE
		fi
	done

	for node in "${!nodes_hp[@]}"; do
		if [[ -z ${nodes[node]} ]]; then
			echo "Node $node doesn't exist, ignoring" >&2
			continue
		fi
		NRHUGE=${nodes_hp[node]:-$NRHUGE} check_hugepages_alloc "${nodes[node]}" "$node"
	done
}
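
# Illustrative resolution of the HUGENODE syntax (values are examples): with
# NRHUGE=512 and HUGENODE='nodes_hp[1]=1024,0', the loops above end up
# requesting 1024 hugepages on node1 and the default 512 on node0.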

function configure_linux() {
	configure_linux_pci
	hugetlbfs_mounts=$(linux_hugetlbfs_mounts)

	if [ -z "$hugetlbfs_mounts" ]; then
		hugetlbfs_mounts=/mnt/huge
		echo "Mounting hugetlbfs at $hugetlbfs_mounts"
		mkdir -p "$hugetlbfs_mounts"
		mount -t hugetlbfs nodev "$hugetlbfs_mounts"
	fi

	configure_linux_hugepages

	if [ "$driver_name" = "vfio-pci" ]; then
		if [ -n "$TARGET_USER" ]; then
			for mount in $hugetlbfs_mounts; do
				chown "$TARGET_USER" "$mount"
				chmod g+w "$mount"
			done

			MEMLOCK_AMNT=$(su "$TARGET_USER" -c "ulimit -l")
			if [[ $MEMLOCK_AMNT != "unlimited" ]]; then
				MEMLOCK_MB=$((MEMLOCK_AMNT / 1024))
				cat <<- MEMLOCK
					"$TARGET_USER" user memlock limit: $MEMLOCK_MB MB

					This is the maximum amount of memory you will be
					able to use with DPDK and VFIO if run as user "$TARGET_USER".
					To change this, please adjust limits.conf memlock limit for user "$TARGET_USER".
				MEMLOCK
				if ((MEMLOCK_AMNT < 65536)); then
					echo ""
					echo "## WARNING: memlock limit is less than 64MB"
					echo -n "## DPDK with VFIO may not be able to initialize "
					echo "if run as user \"$TARGET_USER\"."
				fi
			fi
		fi
	fi

	if [[ $(uname -i) == "x86_64" ]] && [[ ! -e /dev/cpu/0/msr ]]; then
		# Some distros build msr as a module.  Make sure it's loaded to ensure
		#  DPDK can easily figure out the TSC rate rather than relying on 100ms
		#  sleeps.
		modprobe msr &> /dev/null || true
	fi
}

function reset_linux_pci() {
	# virtio
	# TODO: check if virtio-pci is loaded first and just unbind if it is not loaded
	# Requires some more investigation - for example, some kernels do not seem to have
	#  virtio-pci but just virtio_scsi instead.  Also need to make sure we get the
	#  underscore vs. dash right in the virtio_scsi name.
	modprobe virtio-pci || true
	for bdf in "${!all_devices_d[@]}"; do
		((all_devices_d["$bdf"] == 0)) || continue

		driver=$(collect_driver "$bdf")
		if [[ -n $driver ]] && ! check_for_driver "$driver"; then
			linux_bind_driver "$bdf" "$driver"
		else
			linux_unbind_driver "$bdf"
		fi
	done

	echo "1" > "/sys/bus/pci/rescan"
}

function reset_linux() {
	reset_linux_pci
	for mount in $(linux_hugetlbfs_mounts); do
		for hp in "$mount"/spdk*map_*; do
			flock -n "$hp" true && rm -f "$hp"
		done
	done
	rm -f /run/.spdk*
}

function status_linux() {
	echo "Hugepages" >&2
	printf "%-6s %10s %8s / %6s\n" "node" "hugesize" "free" "total" >&2

	numa_nodes=0
	for path in /sys/devices/system/node/node*/hugepages/hugepages-*/; do
		numa_nodes=$((numa_nodes + 1))
		free_pages=$(cat $path/free_hugepages)
		all_pages=$(cat $path/nr_hugepages)

		[[ $path =~ (node[0-9]+)/hugepages/hugepages-([0-9]+kB) ]]

		node=${BASH_REMATCH[1]}
		huge_size=${BASH_REMATCH[2]}

		printf "%-6s %10s %8s / %6s\n" $node $huge_size $free_pages $all_pages
	done

	# fall back to system-wide hugepages
	if [ "$numa_nodes" = "0" ]; then
		free_pages=$(grep HugePages_Free /proc/meminfo | awk '{ print $2 }')
		all_pages=$(grep HugePages_Total /proc/meminfo | awk '{ print $2 }')
		node="-"
		huge_size="$HUGEPGSZ"

		printf "%-6s %10s %8s / %6s\n" $node $huge_size $free_pages $all_pages
	fi

	printf '\n%-25s %-15s %-6s %-6s %-7s %-16s %-10s %s\n' \
		"Type" "BDF" "Vendor" "Device" "NUMA" "Driver" "Device" "Block devices" >&2

	sorted_bdfs=($(printf '%s\n' "${!all_devices_d[@]}" | sort))

	for bdf in "${sorted_bdfs[@]}"; do
		driver=${pci_bus_driver["$bdf"]}
		if [ "$numa_nodes" = "0" ]; then
			node="-"
		else
			node=$(cat /sys/bus/pci/devices/$bdf/numa_node)
			if ((node == -1)); then
				node=unknown
			fi
		fi
		if [ "$driver" = "nvme" ] && [ -d /sys/bus/pci/devices/$bdf/nvme ]; then
			name=$(ls /sys/bus/pci/devices/$bdf/nvme)
		else
			name="-"
		fi

		if [[ -n ${nvme_d["$bdf"]} || -n ${virtio_d["$bdf"]} ]]; then
			blknames=($(get_block_dev_from_bdf "$bdf"))
		else
			blknames=("-")
		fi

		desc=""
		desc=${desc:-${nvme_d["$bdf"]:+NVMe${nvme_vmd_d["$bdf"]:+@${nvme_vmd_d["$bdf"]}(VMD)}}}
		desc=${desc:-${ioat_d["$bdf"]:+I/OAT}}
		desc=${desc:-${dsa_d["$bdf"]:+DSA}}
		desc=${desc:-${iaa_d["$bdf"]:+IAA}}
		desc=${desc:-${virtio_d["$bdf"]:+virtio}}
		desc=${desc:-${vmd_d["$bdf"]:+VMD}}

		printf '%-25s %-15s %-6s %-6s %-7s %-16s %-10s %s\n' \
			"$desc" "$bdf" "${pci_ids_vendor["$bdf"]#0x}" "${pci_ids_device["$bdf"]#0x}" \
			"$node" "${driver:--}" "${name:-}" "${blknames[*]:--}"
	done
}
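
# Illustrative "status" row (values and alignment depend on the system):
#   NVMe   0000:01:00.0   8086   0953   0   vfio-pci   nvme0   -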

function status_freebsd() {
	local pci

	status_print() (
		local type=$1
		local dev driver

		shift

		for pci; do
			printf '%-8s %-15s %-6s %-6s %-16s\n' \
				"$type" \
				"$pci" \
				"${pci_ids_vendor["$pci"]}" \
				"${pci_ids_device["$pci"]}" \
				"${pci_bus_driver["$pci"]}"
		done | sort -k2,2
	)

	local contigmem=present
	local contigmem_buffer_size
	local contigmem_num_buffers

	if ! kldstat -q -m contigmem; then
		contigmem="not present"
	fi
	if ! contigmem_buffer_size=$(kenv hw.contigmem.buffer_size 2> /dev/null); then
		contigmem_buffer_size="not set"
	fi
	if ! contigmem_num_buffers=$(kenv hw.contigmem.num_buffers 2> /dev/null); then
		contigmem_num_buffers="not set"
	fi

	cat <<- BSD_INFO
		Contigmem ($contigmem)
		Buffer Size: $contigmem_buffer_size
		Num Buffers: $contigmem_num_buffers

	BSD_INFO

	printf '\n%-8s %-15s %-6s %-6s %-16s\n' \
		"Type" "BDF" "Vendor" "Device" "Driver" >&2

	status_print "NVMe" "${!nvme_d[@]}"
	status_print "I/OAT" "${!ioat_d[@]}"
	status_print "DSA" "${!dsa_d[@]}"
	status_print "IAA" "${!iaa_d[@]}"
	status_print "VMD" "${!vmd_d[@]}"
}

function configure_freebsd_pci() {
	local BDFS

	BDFS+=("${!nvme_d[@]}")
	BDFS+=("${!ioat_d[@]}")
	BDFS+=("${!dsa_d[@]}")
	BDFS+=("${!iaa_d[@]}")
	BDFS+=("${!vmd_d[@]}")

	# Drop the domain part from all the addresses
	BDFS=("${BDFS[@]#*:}")

	local IFS=","
	kldunload nic_uio.ko || true
	kenv hw.nic_uio.bdfs="${BDFS[*]}"
	kldload nic_uio.ko
}
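
# Illustrative result (addresses are examples): with NVMe controllers at
# 0000:01:00.0 and 0000:02:00.0, the function above sets
# hw.nic_uio.bdfs="01:00.0,02:00.0" before reloading nic_uio.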

function configure_freebsd() {
	if ! check_for_driver_freebsd; then
		echo "DPDK drivers (contigmem and/or nic_uio) are missing, aborting" >&2
		return 1
	fi
	configure_freebsd_pci
	# If contigmem is already loaded but the HUGEMEM specified doesn't match the
	#  previous value, unload contigmem so that we can reload with the new value.
	if kldstat -q -m contigmem; then
		# contigmem may be loaded, but the kernel environment doesn't necessarily
		# have to be set at this point. If it isn't, kenv will fail to pick up
		# the hw. options. Handle it.
		if ! contigmem_num_buffers=$(kenv hw.contigmem.num_buffers); then
			contigmem_num_buffers=-1
		fi 2> /dev/null
		if ((contigmem_num_buffers != HUGEMEM / 256)); then
			kldunload contigmem.ko
		fi
	fi
	if ! kldstat -q -m contigmem; then
		kenv hw.contigmem.num_buffers=$((HUGEMEM / 256))
		kenv hw.contigmem.buffer_size=$((256 * 1024 * 1024))
		kldload contigmem.ko
	fi
}
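
# Illustrative math: the default HUGEMEM=2048 (MB) maps to
# hw.contigmem.num_buffers=8, i.e. eight 256 MB buffers (2048 / 256 = 8).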

function reset_freebsd() {
	kldunload contigmem.ko || true
	kldunload nic_uio.ko || true
}

function set_hp() {
	if [[ -n $HUGEPGSZ && ! -e /sys/kernel/mm/hugepages/hugepages-${HUGEPGSZ}kB ]]; then
		echo "${HUGEPGSZ}kB is not supported by the running kernel, ignoring" >&2
		unset -v HUGEPGSZ
	fi

	HUGEPGSZ=${HUGEPGSZ:-$(grep Hugepagesize /proc/meminfo | cut -d : -f 2 | tr -dc '0-9')}
	HUGEPGSZ_MB=$((HUGEPGSZ / 1024))
	NRHUGE=${NRHUGE:-$(((HUGEMEM + HUGEPGSZ_MB - 1) / HUGEPGSZ_MB))}
}
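
# Illustrative math: with the default HUGEMEM=2048 (MB) and 2048 kB hugepages,
# HUGEPGSZ_MB=2 and NRHUGE=(2048 + 2 - 1) / 2 = 1024 hugepages - the expression
# rounds the request up to a whole page under integer division.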

kmsg "spdk: $0 $* (start)"

CMD=reset cache_pci_bus

mode=$1

if [ -z "$mode" ]; then
	mode="config"
fi

: ${HUGEMEM:=2048}
: ${PCI_ALLOWED:=""}
: ${PCI_BLOCKED:=""}

if [ -n "$NVME_ALLOWED" ]; then
	PCI_ALLOWED="$PCI_ALLOWED $NVME_ALLOWED"
fi

if [ -n "$SKIP_PCI" ]; then
	PCI_ALLOWED="none"
fi

if [ -z "$TARGET_USER" ]; then
	TARGET_USER="$SUDO_USER"
	if [ -z "$TARGET_USER" ]; then
		TARGET_USER=$(logname 2> /dev/null) || true
	fi
fi

collect_devices "$mode"

if [[ $os == Linux ]]; then
	set_hp
fi

if [[ $mode == interactive ]]; then
	source "$rootdir/scripts/common/setup/interactive.sh"
	main_menu "$2" || exit 0
fi

if [[ $mode == reset && $PCI_BLOCK_SYNC_ON_RESET == yes ]]; then
	# Note that this will wait only for the first block device attached to
	# a given storage controller. For nvme this may miss some of the devs
	# in case multiple namespaces are in place.
	# FIXME: Wait for nvme controller(s) to be in live state and determine
	# the number of configured namespaces, build a list of potential block devs
	# and pass them to sync_dev_uevents. Is it worth the effort?
	bdfs_to_wait_for=()
	for bdf in "${!all_devices_d[@]}"; do
		((all_devices_d["$bdf"] == 0)) || continue
		if [[ -n ${nvme_d["$bdf"]} || -n ${virtio_d["$bdf"]} ]]; then
			[[ $(collect_driver "$bdf") != "${pci_bus_driver["$bdf"]}" ]] || continue
			bdfs_to_wait_for+=("$bdf")
		fi
	done
	if ((${#bdfs_to_wait_for[@]} > 0)); then
		echo "Waiting for block devices as requested"
		export UEVENT_TIMEOUT=5 DEVPATH_LOOKUP=yes DEVPATH_SUBSYSTEM=pci
		"$rootdir/scripts/sync_dev_uevents.sh" \
			block/disk \
			"${bdfs_to_wait_for[@]}" &
		sync_pid=$!
	fi
fi

if [[ $os == Linux ]]; then
	if [ "$mode" == "config" ]; then
		configure_linux
	elif [ "$mode" == "cleanup" ]; then
		cleanup_linux
		clear_hugepages
	elif [ "$mode" == "reset" ]; then
		reset_linux
	elif [ "$mode" == "status" ]; then
		status_linux
	elif [ "$mode" == "help" ]; then
		usage $0
	else
		usage $0 "Invalid argument '$mode'"
	fi
else
	if [ "$mode" == "config" ]; then
		configure_freebsd
	elif [ "$mode" == "reset" ]; then
		reset_freebsd
	elif [ "$mode" == "cleanup" ]; then
		echo "setup.sh cleanup function not yet supported on $os"
	elif [ "$mode" == "status" ]; then
		status_freebsd
	elif [ "$mode" == "help" ]; then
		usage $0
	else
		usage $0 "Invalid argument '$mode'"
	fi
fi

if [[ -e /proc/$sync_pid/status ]]; then
	wait "$sync_pid"
fi

kmsg "spdk: $0 $* (done)"