#!/usr/bin/env bash
#  SPDX-License-Identifier: BSD-3-Clause
#  Copyright (C) 2016 Intel Corporation
#  All rights reserved.
#
set -e
shopt -s nullglob extglob

os=$(uname -s)

if [[ $os != Linux && $os != FreeBSD ]]; then
	echo "Unsupported platform ($os), aborting"
	exit 1
fi

rootdir=$(readlink -f $(dirname $0))/..
source "$rootdir/scripts/common.sh"

function usage() {
	if [[ $os == Linux ]]; then
		options="[config|reset|status|cleanup|help]"
	else
		options="[config|reset|help]"
	fi

	[[ -n $2 ]] && (
		echo "$2"
		echo ""
	)
	echo "Helper script for allocating hugepages and binding NVMe, I/OAT, VMD and Virtio devices"
	echo "to a generic VFIO kernel driver. If VFIO is not available on the system, this script"
	echo "will fall back to UIO. NVMe and Virtio devices with active mountpoints will be ignored."
	echo "All hugepage operations use the default hugepage size on the system (hugepagesz)."
	echo "Usage: $(basename $1) $options"
	echo
	echo "$options - as follows:"
	echo "config            Default mode. Allocate hugepages and bind PCI devices."
	if [[ $os == Linux ]]; then
		echo "cleanup           Remove any orphaned files that may be left in the system after an SPDK application exits"
	fi
	echo "reset             Rebind PCI devices back to their original drivers."
	echo "                  Also clean up any leftover SPDK files/resources."
	echo "                  Hugepage memory size will remain unchanged."
	if [[ $os == Linux ]]; then
		echo "status            Print status of all SPDK-compatible devices on the system."
	fi
	echo "help              Print this help message."
	echo
	echo "The following environment variables can be specified."
	echo "HUGEMEM           Size of hugepage memory to allocate (in MB). 2048 by default."
	echo "                  For NUMA systems, the hugepages will be distributed on node0 by"
	echo "                  default."
	echo "HUGE_EVEN_ALLOC   If set to 'yes', hugepages will be evenly distributed across all"
	echo "                  of the system's NUMA nodes (effectively ignoring anything set in HUGENODE)."
	echo "                  Uses the kernel's default size for hugepages."
	echo "NRHUGE            Number of hugepages to allocate. This variable overrides HUGEMEM."
	echo "HUGENODE          Specific NUMA node to allocate hugepages on. Multiple nodes can be"
	echo "                  separated with commas. By default, NRHUGE will be applied on each node."
	echo "                  Hugepages can be defined per node with e.g.:"
	echo "                  HUGENODE='nodes_hp[0]=2048,nodes_hp[1]=512,2' - this will allocate"
	echo "                  2048 pages for node0, 512 for node1 and default NRHUGE for node2."
	echo "HUGEPGSZ          Size of the hugepages to use in kB. If not set, the kernel's default"
	echo "                  setting is used."
	echo "SHRINK_HUGE       If set to 'yes', hugepages allocation won't be skipped in case the"
	echo "                  number of requested hugepages is lower than what's already"
	echo "                  allocated. Doesn't apply when HUGE_EVEN_ALLOC is in use."
	echo "CLEAR_HUGE        If set to 'yes', an attempt to remove hugepages from all nodes will"
	echo "                  be made prior to allocation."
	echo "PCI_ALLOWED"
	echo "PCI_BLOCKED       Whitespace-separated list of PCI devices (NVMe, I/OAT, VMD, Virtio)."
	echo "                  Each device must be specified as a full PCI address."
	echo "                  E.g. PCI_ALLOWED=\"0000:01:00.0 0000:02:00.0\""
	echo "                  To block all PCI devices: PCI_ALLOWED=\"none\""
	echo "                  To allow all PCI devices except 0000:01:00.0: PCI_BLOCKED=\"0000:01:00.0\""
	echo "                  To allow only PCI device 0000:01:00.0: PCI_ALLOWED=\"0000:01:00.0\""
	echo "                  If PCI_ALLOWED and PCI_BLOCKED are empty or unset, all PCI devices"
	echo "                  will be bound."
	echo "                  Each device in PCI_BLOCKED will be ignored (driver won't be changed)."
	echo "                  PCI_BLOCKED has precedence over PCI_ALLOWED."
	echo "TARGET_USER       User that will own the hugepage mountpoint directory and vfio groups."
	echo "                  By default the current user will be used."
	echo "DRIVER_OVERRIDE   Disable automatic vfio-pci/uio_pci_generic selection and forcefully"
	echo "                  bind devices to the given driver."
	echo "                  E.g. DRIVER_OVERRIDE=uio_pci_generic or DRIVER_OVERRIDE=/home/public/dpdk/build/kmod/igb_uio.ko"
	echo "PCI_BLOCK_SYNC_ON_RESET"
	echo "                  If set in the environment, an attempt to wait for block devices associated"
	echo "                  with a given PCI device will be made upon reset."
	exit 0
}
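
# Illustrative invocations (hedged examples; typically run as root or via sudo):
#   ./setup.sh                                   # same as "config": allocate hugepages, bind devices
#   HUGEMEM=4096 ./setup.sh config               # request 4096 MB worth of hugepages first
#   PCI_BLOCKED="0000:01:00.0" ./setup.sh        # bind everything except 0000:01:00.0
#   DRIVER_OVERRIDE=uio_pci_generic ./setup.sh   # skip driver auto-selection
#   ./setup.sh reset                             # rebind devices to their original drivers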

# In monolithic kernels lsmod won't work, so back it with a check of
# /sys/module. We also check /sys/bus/pci/drivers/ as neither lsmod nor
# /sys/module might contain the needed info (like on Fedora-like OSes).
function check_for_driver() {
	if [[ -z $1 ]]; then
		return 0
	fi

	if lsmod | grep -q ${1//-/_}; then
		return 1
	fi

	if [[ -d /sys/module/${1} || -d \
		/sys/module/${1//-/_} || -d \
		/sys/bus/pci/drivers/${1} || -d \
		/sys/bus/pci/drivers/${1//-/_} ]]; then
		return 2
	fi
	return 0
}
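
# Note on check_for_driver()'s inverted return convention: 0 means the driver
# was NOT found, 1 (via lsmod) or 2 (via sysfs) means it IS present. Callers
# therefore negate the call, e.g.:
#   if ! check_for_driver igb_uio; then echo "igb_uio is present"; fi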

function check_for_driver_freebsd() {
	# Check if dpdk drivers (nic_uio, contigmem) are in the kernel's module path.
	local search_paths path driver
	IFS=";" read -ra search_paths < <(kldconfig -rU)

	for driver in contigmem.ko nic_uio.ko; do
		for path in "${search_paths[@]}"; do
			[[ -f $path/$driver ]] && continue 2
		done
		return 1
	done
	return 0
}

function pci_dev_echo() {
	local bdf="$1"
	shift
	echo "$bdf (${pci_ids_vendor["$bdf"]#0x} ${pci_ids_device["$bdf"]#0x}): $*"
}
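
# Example output (using a hypothetical NVMe controller at 0000:01:00.0 with
# vendor 0x8086 and device 0x0953 cached by common.sh):
#   pci_dev_echo 0000:01:00.0 "Already using the nvme driver"
#   -> 0000:01:00.0 (8086 0953): Already using the nvme driver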

function linux_bind_driver() {
	bdf="$1"
	driver_name="$2"
	old_driver_name=${drivers_d["$bdf"]:-no driver}
	ven_dev_id="${pci_ids_vendor["$bdf"]#0x} ${pci_ids_device["$bdf"]#0x}"

	if [[ $driver_name == "$old_driver_name" ]]; then
		pci_dev_echo "$bdf" "Already using the $old_driver_name driver"
		return 0
	fi

	if [[ $old_driver_name != "no driver" ]]; then
		echo "$ven_dev_id" > "/sys/bus/pci/devices/$bdf/driver/remove_id" 2> /dev/null || true
		echo "$bdf" > "/sys/bus/pci/devices/$bdf/driver/unbind"
	fi

	pci_dev_echo "$bdf" "$old_driver_name -> $driver_name"

	if [[ $driver_name == "none" ]]; then
		return 0
	fi

	echo "$ven_dev_id" > "/sys/bus/pci/drivers/$driver_name/new_id" 2> /dev/null || true
	echo "$bdf" > "/sys/bus/pci/drivers/$driver_name/bind" 2> /dev/null || true

	if [[ $driver_name == uio_pci_generic ]] && ! check_for_driver igb_uio; then
		# Check if the uio_pci_generic driver is broken as it might be in
		# some 4.18.x kernels (see centos8 for instance) - if our device
		# didn't get a proper uio entry, fall back to igb_uio
		if [[ ! -e /sys/bus/pci/devices/$bdf/uio ]]; then
			pci_dev_echo "$bdf" "uio_pci_generic potentially broken, moving to igb_uio"
			drivers_d["$bdf"]="no driver"
			# This call will override $driver_name for the remaining devices as well
			linux_bind_driver "$bdf" igb_uio
		fi
	fi

	iommu_group=$(basename $(readlink -f /sys/bus/pci/devices/$bdf/iommu_group))
	if [ -e "/dev/vfio/$iommu_group" ]; then
		if [ -n "$TARGET_USER" ]; then
			chown "$TARGET_USER" "/dev/vfio/$iommu_group"
		fi
	fi
}

function linux_unbind_driver() {
	local bdf="$1"
	local ven_dev_id
	ven_dev_id="${pci_ids_vendor["$bdf"]#0x} ${pci_ids_device["$bdf"]#0x}"
	local old_driver_name=${drivers_d["$bdf"]:-no driver}

	if [[ $old_driver_name == "no driver" ]]; then
		pci_dev_echo "$bdf" "Not bound to any driver"
		return 0
	fi

	if [[ -e /sys/bus/pci/drivers/$old_driver_name ]]; then
		echo "$ven_dev_id" > "/sys/bus/pci/drivers/$old_driver_name/remove_id" 2> /dev/null || true
		echo "$bdf" > "/sys/bus/pci/drivers/$old_driver_name/unbind"
	fi

	pci_dev_echo "$bdf" "$old_driver_name -> no driver"
}

function linux_hugetlbfs_mounts() {
	mount | grep ' type hugetlbfs ' | awk '{ print $3 }'
}

function get_block_dev_from_bdf() {
	local bdf=$1
	local block

	for block in /sys/block/*; do
		if [[ $(readlink -f "$block/device") == *"/$bdf/"* ]]; then
			echo "${block##*/}"
		fi
	done
}
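
# Example: for a hypothetical NVMe controller at 0000:01:00.0 whose namespace
# is exposed as /sys/block/nvme0n1, "get_block_dev_from_bdf 0000:01:00.0"
# would print "nvme0n1".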

function get_used_bdf_block_devs() {
	local bdf=$1
	local blocks block blockp dev mount holder
	local used

	hash lsblk &> /dev/null || return 1
	blocks=($(get_block_dev_from_bdf "$bdf"))

	for block in "${blocks[@]}"; do
		# Check if the device is held by some other device, regardless of
		# whether it's mounted or not.
		for holder in "/sys/class/block/$block"*/holders/*; do
			[[ -e $holder ]] || continue
			blockp=${holder%/holders*} blockp=${blockp##*/}
			if [[ -e $holder/slaves/$blockp ]]; then
				used+=("holder@$blockp:${holder##*/}")
			fi
		done
		while read -r dev mount; do
			if [[ -e $mount ]]; then
				used+=("mount@$block:$dev")
			fi
		done < <(lsblk -l -n -o NAME,MOUNTPOINT "/dev/$block")
		if ((${#used[@]} == 0)); then
			# Make sure we check if there's any valid data present on the target device,
			# regardless of whether it's being actively used or not. This is mainly done
			# to make sure we don't miss more complex setups like ZFS pools, etc.
			if block_in_use "$block" > /dev/null; then
				used+=("data@$block")
			fi
		fi
	done

	if ((${#used[@]} > 0)); then
		printf '%s\n' "${used[@]}"
	fi
}
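
# Legend for the entries printed by get_used_bdf_block_devs():
#   holder@<dev>:<holder> - <dev> is held by another device, e.g. an LVM/dm volume
#   mount@<block>:<dev>   - <dev> on the given block device has an active mountpoint
#   data@<block>          - a valid signature was found on <block>, e.g. a ZFS pool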

function collect_devices() {
	# NVMe, IOAT, DSA, IAA, VIRTIO, VMD

	local ids dev_type dev_id bdf bdfs in_use driver

	ids+="PCI_DEVICE_ID_INTEL_IOAT"
	ids+="|PCI_DEVICE_ID_INTEL_DSA"
	ids+="|PCI_DEVICE_ID_INTEL_IAA"
	ids+="|PCI_DEVICE_ID_VIRTIO"
	ids+="|PCI_DEVICE_ID_INTEL_VMD"
	ids+="|SPDK_PCI_CLASS_NVME"

	local -gA nvme_d ioat_d dsa_d iaa_d virtio_d vmd_d all_devices_d drivers_d

	while read -r _ dev_type dev_id; do
		bdfs=(${pci_bus_cache["0x8086:$dev_id"]})
		[[ $dev_type == *NVME* ]] && bdfs=(${pci_bus_cache["$dev_id"]})
		[[ $dev_type == *VIRT* ]] && bdfs=(${pci_bus_cache["0x1af4:$dev_id"]})
		[[ $dev_type =~ (NVME|IOAT|DSA|IAA|VIRTIO|VMD) ]] && dev_type=${BASH_REMATCH[1],,}
		for bdf in "${bdfs[@]}"; do
			in_use=0
			if [[ $1 != status ]]; then
				if ! pci_can_use "$bdf"; then
					pci_dev_echo "$bdf" "Skipping denied controller at $bdf"
					in_use=1
				fi
				if [[ $dev_type == nvme || $dev_type == virtio ]]; then
					if ! verify_bdf_block_devs "$bdf"; then
						in_use=1
					fi
				fi
				if [[ $dev_type == vmd ]]; then
					if [[ $PCI_ALLOWED != *"$bdf"* ]]; then
						pci_dev_echo "$bdf" "Skipping not allowed VMD controller at $bdf"
						in_use=1
					elif [[ " ${drivers_d[*]} " =~ "nvme" ]]; then
						if [[ "${DRIVER_OVERRIDE}" != "none" ]]; then
							if [ "$mode" == "config" ]; then
								cat <<- MESSAGE
									Binding new driver to VMD device. If there are NVMe SSDs behind the VMD endpoint
									which are attached to the kernel NVMe driver, the binding process may go faster
									if you first run this script with DRIVER_OVERRIDE="none" to unbind only the
									NVMe SSDs, and then run again to unbind the VMD devices.
								MESSAGE
							fi
						fi
					fi
				fi
			fi
			eval "${dev_type}_d[$bdf]=$in_use"
			all_devices_d["$bdf"]=$in_use
			if [[ -e /sys/bus/pci/devices/$bdf/driver ]]; then
				driver=$(readlink -f "/sys/bus/pci/devices/$bdf/driver")
				drivers_d["$bdf"]=${driver##*/}
			fi
		done
	done < <(grep -E "$ids" "$rootdir/include/spdk/pci_ids.h")
}

function collect_driver() {
	local bdf=$1
	local drivers driver

	if [[ -e /sys/bus/pci/devices/$bdf/modalias ]] \
		&& drivers=($(modprobe -R "$(< "/sys/bus/pci/devices/$bdf/modalias")")); then
		# Pick the first entry in case multiple aliases are bound to a driver.
		driver=$(readlink -f "/sys/module/${drivers[0]}/drivers/pci:"*)
		driver=${driver##*/}
	else
		[[ -n ${nvme_d["$bdf"]} ]] && driver=nvme
		[[ -n ${ioat_d["$bdf"]} ]] && driver=ioatdma
		[[ -n ${dsa_d["$bdf"]} ]] && driver=dsa
		[[ -n ${iaa_d["$bdf"]} ]] && driver=iaa
		[[ -n ${virtio_d["$bdf"]} ]] && driver=virtio-pci
		[[ -n ${vmd_d["$bdf"]} ]] && driver=vmd
	fi 2> /dev/null
	echo "$driver"
}
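
# Example: for a device whose modalias resolves to the nvme module, modprobe -R
# prints "nvme" and the readlink above yields e.g. /sys/module/nvme/drivers/pci:nvme,
# so collect_driver prints "nvme". Without a modalias match, the per-type
# fallbacks in the else branch apply.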

function verify_bdf_block_devs() {
	local bdf=$1
	local blknames
	blknames=($(get_used_bdf_block_devs "$bdf")) || return 1

	if ((${#blknames[@]} > 0)); then
		local IFS=","
		pci_dev_echo "$bdf" "Active devices: ${blknames[*]}, so not binding PCI dev"
		return 1
	fi
}

function configure_linux_pci() {
	local driver_path=""
	driver_name=""
	igb_uio_fallback=""

	if [[ -r "$rootdir/dpdk/build-tmp/kernel/linux/igb_uio/igb_uio.ko" ]]; then
		# igb_uio is a common driver to override with and it depends on uio.
		modprobe uio || true
		if ! check_for_driver igb_uio || insmod "$rootdir/dpdk/build-tmp/kernel/linux/igb_uio/igb_uio.ko"; then
			igb_uio_fallback="$rootdir/dpdk/build-tmp/kernel/linux/igb_uio/igb_uio.ko"
		fi
	fi

	if [[ "${DRIVER_OVERRIDE}" == "none" ]]; then
		driver_name=none
	elif [[ -n "${DRIVER_OVERRIDE}" ]]; then
		driver_path="$DRIVER_OVERRIDE"
		driver_name="${DRIVER_OVERRIDE##*/}"
		# modprobe and the sysfs don't use the .ko suffix.
		driver_name=${driver_name%.ko}
		# path = name -> there is no path
		if [[ "$driver_path" = "$driver_name" ]]; then
			driver_path=""
		fi
	elif [[ -n "$(ls /sys/kernel/iommu_groups)" || (-e \
	/sys/module/vfio/parameters/enable_unsafe_noiommu_mode && \
	"$(cat /sys/module/vfio/parameters/enable_unsafe_noiommu_mode)" == "Y") ]]; then
		driver_name=vfio-pci
		# Just in case, attempt to load the VFIO_IOMMU_TYPE1 module into the kernel - this
		# should be done automatically by modprobe since this particular module should
		# be a part of vfio-pci dependencies, however, on some distros, it seems that
		# it's not the case. See #1689.
		if modinfo vfio_iommu_type1 > /dev/null; then
			modprobe vfio_iommu_type1
		fi
	elif ! check_for_driver uio_pci_generic || modinfo uio_pci_generic > /dev/null 2>&1; then
		driver_name=uio_pci_generic
	elif [[ -e $igb_uio_fallback ]]; then
		driver_path="$igb_uio_fallback"
		driver_name="igb_uio"
		echo "WARNING: uio_pci_generic not detected - using $driver_name"
	else
		echo "No valid drivers found [vfio-pci, uio_pci_generic, igb_uio]. Please enable one of the kernel modules."
		return 1
	fi

	# modprobe assumes the module lives in the kernel's module directory. If the
	# user passed in a path, we should use insmod.
	if [[ $driver_name != "none" ]]; then
		if [[ -n "$driver_path" ]]; then
			insmod $driver_path || true
		else
			modprobe $driver_name
		fi
	fi

	for bdf in "${!all_devices_d[@]}"; do
		if ((all_devices_d["$bdf"] == 0)); then
			if [[ -n ${nvme_d["$bdf"]} ]]; then
				# Some nvme controllers may take a significant amount of time while being
				# unbound from the driver. Put that task into the background to speed up
				# the whole process. Currently this is done only for the devices bound to
				# the nvme driver as others, e.g. ioatdma, trigger a kernel BUG when being
				# unbound in parallel. See https://bugzilla.kernel.org/show_bug.cgi?id=209041.
				linux_bind_driver "$bdf" "$driver_name" &
			else
				linux_bind_driver "$bdf" "$driver_name"
			fi
		fi
	done
	wait

	echo "1" > "/sys/bus/pci/rescan"
}

function cleanup_linux() {
	local dirs_to_clean=() files_to_clean=() opened_files=() file_locks=()
	local match_spdk="spdk_tgt|iscsi|vhost|nvmf|rocksdb|bdevio|bdevperf|vhost_fuzz|nvme_fuzz|accel_perf|bdev_svc"

	dirs_to_clean=({/var/run,/tmp}/dpdk/spdk{,_pid}+([0-9]))
	if [[ -d $XDG_RUNTIME_DIR ]]; then
		dirs_to_clean+=("$XDG_RUNTIME_DIR/dpdk/spdk"{,_pid}+([0-9]))
	fi

	for dir in "${dirs_to_clean[@]}"; do
		files_to_clean+=("$dir/"*)
	done
	file_locks+=(/var/tmp/spdk_pci_lock*)
	file_locks+=(/var/tmp/spdk_cpu_lock*)

	files_to_clean+=(/dev/shm/@(@($match_spdk)_trace|spdk_iscsi_conns)*)
	files_to_clean+=("${file_locks[@]}")

	# This may fail in case a path that readlink attempts to resolve suddenly
	# disappears (as may happen with terminating processes).
	opened_files+=($(readlink -f /proc/+([0-9])/fd/+([0-9]))) || true

	if ((${#opened_files[@]} == 0)); then
		echo "Can't get list of opened files!"
		exit 1
	fi

	echo 'Cleaning'
	for f in "${files_to_clean[@]}"; do
		[[ -e $f ]] || continue
		if [[ ${opened_files[*]} != *"$f"* ]]; then
			echo "Removing:    $f"
			rm $f
		else
			echo "Still open: $f"
		fi
	done

	for dir in "${dirs_to_clean[@]}"; do
		[[ -d $dir ]] || continue
		if [[ ${opened_files[*]} != *"$dir"* ]]; then
			echo "Removing:    $dir"
			rmdir $dir
		else
			echo "Still open: $dir"
		fi
	done
	echo "Clean"
}

check_hugepages_alloc() {
	local hp_int=$1
	local allocated_hugepages

	allocated_hugepages=$(< "$hp_int")

	if ((NRHUGE <= allocated_hugepages)) && [[ $SHRINK_HUGE != yes ]]; then
		echo "INFO: Requested $NRHUGE hugepages but $allocated_hugepages already allocated ${2:+on node$2}"
		return 0
	fi

	echo $((NRHUGE < 0 ? 0 : NRHUGE)) > "$hp_int"

	allocated_hugepages=$(< "$hp_int")
	if ((allocated_hugepages < NRHUGE)); then
		cat <<- ERROR

			## ERROR: requested $NRHUGE hugepages but only $allocated_hugepages could be allocated ${2:+on node$2}.
			## Memory might be heavily fragmented. Please try flushing the system cache, or reboot the machine.
		ERROR
		return 1
	fi
}
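
# Usage sketch (mirrors the calls made from configure_linux_hugepages below):
#   NRHUGE=1024 check_hugepages_alloc /proc/sys/vm/nr_hugepages    # system-wide
#   NRHUGE=512 check_hugepages_alloc \
#     /sys/devices/system/node/node0/hugepages/hugepages-2048kB/nr_hugepages 0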

clear_hugepages() { echo 0 > /proc/sys/vm/nr_hugepages; }

configure_linux_hugepages() {
	local node system_nodes
	local nodes_to_use nodes_hp

	if [[ $CLEAR_HUGE == yes ]]; then
		clear_hugepages
	fi

	if [[ $HUGE_EVEN_ALLOC == yes ]]; then
		clear_hugepages
		check_hugepages_alloc /proc/sys/vm/nr_hugepages
		return 0
	fi

	for node in /sys/devices/system/node/node*; do
		[[ -e $node ]] || continue
		nodes[${node##*node}]=$node/hugepages/hugepages-${HUGEPGSZ}kB/nr_hugepages
	done

	if ((${#nodes[@]} == 0)); then
		# No NUMA support? Fall back to the common interface.
		check_hugepages_alloc /proc/sys/vm/nr_hugepages
		return 0
	fi

	IFS="," read -ra nodes_to_use <<< "$HUGENODE"
	if ((${#nodes_to_use[@]} == 0)); then
		nodes_to_use[0]=0
	fi

	# Align indexes with node ids
	for node in "${!nodes_to_use[@]}"; do
		if [[ ${nodes_to_use[node]} =~ ^nodes_hp\[[0-9]+\]= ]]; then
			eval "${nodes_to_use[node]}"
		elif [[ ${nodes_to_use[node]} =~ ^[0-9]+$ ]]; then
			nodes_hp[nodes_to_use[node]]=$NRHUGE
		fi
	done

	for node in "${!nodes_hp[@]}"; do
		if [[ -z ${nodes[node]} ]]; then
			echo "Node $node doesn't exist, ignoring" >&2
			continue
		fi
		NRHUGE=${nodes_hp[node]:-$NRHUGE} check_hugepages_alloc "${nodes[node]}" "$node"
	done
}
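
# Worked example of the HUGENODE parsing above, assuming NRHUGE=256:
#   HUGENODE='nodes_hp[0]=2048,nodes_hp[1]=512,2'
# splits on "," into three entries; the first two are eval'ed as-is, setting
# nodes_hp[0]=2048 and nodes_hp[1]=512, while the plain "2" requests the
# default NRHUGE (256 here) for node2.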

function configure_linux() {
	configure_linux_pci
	hugetlbfs_mounts=$(linux_hugetlbfs_mounts)

	if [ -z "$hugetlbfs_mounts" ]; then
		hugetlbfs_mounts=/mnt/huge
		echo "Mounting hugetlbfs at $hugetlbfs_mounts"
		mkdir -p "$hugetlbfs_mounts"
		mount -t hugetlbfs nodev "$hugetlbfs_mounts"
	fi

	configure_linux_hugepages

	if [ "$driver_name" = "vfio-pci" ]; then
		if [ -n "$TARGET_USER" ]; then
			for mount in $hugetlbfs_mounts; do
				chown "$TARGET_USER" "$mount"
				chmod g+w "$mount"
			done

			MEMLOCK_AMNT=$(su "$TARGET_USER" -c "ulimit -l")
			if [[ $MEMLOCK_AMNT != "unlimited" ]]; then
				MEMLOCK_MB=$((MEMLOCK_AMNT / 1024))
				cat <<- MEMLOCK
					"$TARGET_USER" user memlock limit: $MEMLOCK_MB MB

					This is the maximum amount of memory you will be
					able to use with DPDK and VFIO if run as user "$TARGET_USER".
					To change this, please adjust limits.conf memlock limit for user "$TARGET_USER".
				MEMLOCK
				if ((MEMLOCK_AMNT < 65536)); then
					echo ""
					echo "## WARNING: memlock limit is less than 64MB"
					echo -n "## DPDK with VFIO may not be able to initialize "
					echo "if run as user \"$TARGET_USER\"."
				fi
			fi
		fi
	fi

	if [ $(uname -i) == "x86_64" ] && [ ! -e /dev/cpu/0/msr ]; then
		# Some distros build msr as a module.  Make sure it's loaded to ensure
		#  DPDK can easily figure out the TSC rate rather than relying on 100ms
		#  sleeps.
		modprobe msr &> /dev/null || true
	fi
}

function reset_linux_pci() {
	# virtio
	# TODO: check if virtio-pci is loaded first and just unbind if it is not loaded
	# Requires some more investigation - for example, some kernels do not seem to have
	#  virtio-pci but just virtio_scsi instead.  Also need to make sure we get the
	#  underscore vs. dash right in the virtio_scsi name.
	modprobe virtio-pci || true
	for bdf in "${!all_devices_d[@]}"; do
		((all_devices_d["$bdf"] == 0)) || continue

		driver=$(collect_driver "$bdf")
		if [[ -n $driver ]] && ! check_for_driver "$driver"; then
			linux_bind_driver "$bdf" "$driver"
		else
			linux_unbind_driver "$bdf"
		fi
	done

	echo "1" > "/sys/bus/pci/rescan"
}

function reset_linux() {
	reset_linux_pci
	for mount in $(linux_hugetlbfs_mounts); do
		for hp in "$mount"/spdk*map_*; do
			flock -n "$hp" true && rm -f "$hp"
		done
	done
	rm -f /run/.spdk*
}

function status_linux() {
	echo "Hugepages" >&2
	printf "%-6s %10s %8s / %6s\n" "node" "hugesize" "free" "total" >&2

	numa_nodes=0
	for path in /sys/devices/system/node/node*/hugepages/hugepages-*/; do
		numa_nodes=$((numa_nodes + 1))
		free_pages=$(cat $path/free_hugepages)
		all_pages=$(cat $path/nr_hugepages)

		[[ $path =~ (node[0-9]+)/hugepages/hugepages-([0-9]+kB) ]]

		node=${BASH_REMATCH[1]}
		huge_size=${BASH_REMATCH[2]}

		printf "%-6s %10s %8s / %6s\n" $node $huge_size $free_pages $all_pages
	done

	# fall back to system-wide hugepages
	if [ "$numa_nodes" = "0" ]; then
		free_pages=$(grep HugePages_Free /proc/meminfo | awk '{ print $2 }')
		all_pages=$(grep HugePages_Total /proc/meminfo | awk '{ print $2 }')
		node="-"
		huge_size="$HUGEPGSZ"

		printf "%-6s %10s %8s / %6s\n" $node $huge_size $free_pages $all_pages
	fi

	printf '\n%-8s %-15s %-6s %-6s %-7s %-16s %-10s %s\n' \
		"Type" "BDF" "Vendor" "Device" "NUMA" "Driver" "Device" "Block devices" >&2

	sorted_bdfs=($(printf '%s\n' "${!all_devices_d[@]}" | sort))

	for bdf in "${sorted_bdfs[@]}"; do
		driver=${drivers_d["$bdf"]}
		if [ "$numa_nodes" = "0" ]; then
			node="-"
		else
			node=$(cat /sys/bus/pci/devices/$bdf/numa_node)
			if ((node == -1)); then
				node=unknown
			fi
		fi
		if [ "$driver" = "nvme" ] && [ -d /sys/bus/pci/devices/$bdf/nvme ]; then
			name=$(ls /sys/bus/pci/devices/$bdf/nvme)
		else
			name="-"
		fi

		if [[ -n ${nvme_d["$bdf"]} || -n ${virtio_d["$bdf"]} ]]; then
			blknames=($(get_block_dev_from_bdf "$bdf"))
		else
			blknames=("-")
		fi

		desc=""
		desc=${desc:-${nvme_d["$bdf"]:+NVMe}}
		desc=${desc:-${ioat_d["$bdf"]:+I/OAT}}
		desc=${desc:-${dsa_d["$bdf"]:+DSA}}
		desc=${desc:-${iaa_d["$bdf"]:+IAA}}
		desc=${desc:-${virtio_d["$bdf"]:+virtio}}
		desc=${desc:-${vmd_d["$bdf"]:+VMD}}

		printf '%-8s %-15s %-6s %-6s %-7s %-16s %-10s %s\n' \
			"$desc" "$bdf" "${pci_ids_vendor["$bdf"]#0x}" "${pci_ids_device["$bdf"]#0x}" \
			"$node" "${driver:--}" "${name:-}" "${blknames[*]:--}"
	done
}

function status_freebsd() {
	local pci

	status_print() (
		local type=$1
		local dev driver

		shift

		for pci; do
			printf '%-8s %-15s %-6s %-6s %-16s\n' \
				"$type" \
				"$pci" \
				"${pci_ids_vendor["$pci"]}" \
				"${pci_ids_device["$pci"]}" \
				"${pci_bus_driver["$pci"]}"
		done | sort -k2,2
	)

	local contigmem=present
	local contigmem_buffer_size
	local contigmem_num_buffers

	if ! kldstat -q -m contigmem; then
		contigmem="not present"
	fi
	if ! contigmem_buffer_size=$(kenv hw.contigmem.buffer_size 2> /dev/null); then
		contigmem_buffer_size="not set"
	fi
	if ! contigmem_num_buffers=$(kenv hw.contigmem.num_buffers 2> /dev/null); then
		contigmem_num_buffers="not set"
	fi

	cat <<- BSD_INFO
		Contigmem ($contigmem)
		Buffer Size: $contigmem_buffer_size
		Num Buffers: $contigmem_num_buffers

	BSD_INFO

	printf '\n%-8s %-15s %-6s %-6s %-16s\n' \
		"Type" "BDF" "Vendor" "Device" "Driver" >&2

	status_print "NVMe" "${!nvme_d[@]}"
	status_print "I/OAT" "${!ioat_d[@]}"
	status_print "DSA" "${!dsa_d[@]}"
	status_print "IAA" "${!iaa_d[@]}"
	status_print "VMD" "${!vmd_d[@]}"
}

function configure_freebsd_pci() {
	local BDFS

	BDFS+=("${!nvme_d[@]}")
	BDFS+=("${!ioat_d[@]}")
	BDFS+=("${!dsa_d[@]}")
	BDFS+=("${!iaa_d[@]}")
	BDFS+=("${!vmd_d[@]}")

	# Drop the domain part from all the addresses
	BDFS=("${BDFS[@]#*:}")

	local IFS=","
	kldunload nic_uio.ko || true
	kenv hw.nic_uio.bdfs="${BDFS[*]}"
	kldload nic_uio.ko
}
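
# Example: with NVMe controllers at the hypothetical addresses 0000:01:00.0 and
# 0000:02:00.0, dropping the domain part yields:
#   kenv hw.nic_uio.bdfs="01:00.0,02:00.0"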

function configure_freebsd() {
	if ! check_for_driver_freebsd; then
		echo "DPDK drivers (contigmem and/or nic_uio) are missing, aborting" >&2
		return 1
	fi
	configure_freebsd_pci
	# If contigmem is already loaded but the HUGEMEM specified doesn't match the
	#  previous value, unload contigmem so that we can reload with the new value.
	if kldstat -q -m contigmem; then
		# contigmem may be loaded, but the kernel environment doesn't
		# necessarily have to be set at this point. If it isn't, kenv will
		# fail to pick up the hw. options. Handle it.
		if ! contigmem_num_buffers=$(kenv hw.contigmem.num_buffers); then
			contigmem_num_buffers=-1
		fi 2> /dev/null
		if ((contigmem_num_buffers != HUGEMEM / 256)); then
			kldunload contigmem.ko
		fi
	fi
	if ! kldstat -q -m contigmem; then
		kenv hw.contigmem.num_buffers=$((HUGEMEM / 256))
		kenv hw.contigmem.buffer_size=$((256 * 1024 * 1024))
		kldload contigmem.ko
	fi
}
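
# Worked example for the contigmem math above: with the default HUGEMEM=2048
# (MB), configure_freebsd sets hw.contigmem.num_buffers to 2048 / 256 = 8
# buffers of 256 MB each, i.e. 2 GB of contiguous memory in total.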

function reset_freebsd() {
	kldunload contigmem.ko || true
	kldunload nic_uio.ko || true
}

CMD=reset cache_pci_bus

mode=$1

if [ -z "$mode" ]; then
	mode="config"
fi

: ${HUGEMEM:=2048}
: ${PCI_ALLOWED:=""}
: ${PCI_BLOCKED:=""}

if [ -n "$NVME_ALLOWED" ]; then
	PCI_ALLOWED="$PCI_ALLOWED $NVME_ALLOWED"
fi

if [ -n "$SKIP_PCI" ]; then
	PCI_ALLOWED="none"
fi

if [ -z "$TARGET_USER" ]; then
	TARGET_USER="$SUDO_USER"
	if [ -z "$TARGET_USER" ]; then
		TARGET_USER=$(logname 2> /dev/null) || true
	fi
fi

collect_devices "$mode"

if [[ $mode == reset && $PCI_BLOCK_SYNC_ON_RESET == yes ]]; then
	# Note that this will wait only for the first block device attached to
	# a given storage controller. For nvme this may miss some of the devs
	# in case multiple namespaces are in place.
	# FIXME: Wait for nvme controller(s) to be in live state and determine
	# the number of configured namespaces, build a list of potential block devs
	# and pass them to sync_dev_uevents. Is it worth the effort?
	bdfs_to_wait_for=()
	for bdf in "${!all_devices_d[@]}"; do
		((all_devices_d["$bdf"] == 0)) || continue
		if [[ -n ${nvme_d["$bdf"]} || -n ${virtio_d["$bdf"]} ]]; then
			[[ $(collect_driver "$bdf") != "${drivers_d["$bdf"]}" ]] || continue
			bdfs_to_wait_for+=("$bdf")
		fi
	done
	if ((${#bdfs_to_wait_for[@]} > 0)); then
		echo "Waiting for block devices as requested"
		export UEVENT_TIMEOUT=5 DEVPATH_LOOKUP=yes DEVPATH_SUBSYSTEM=pci
		"$rootdir/scripts/sync_dev_uevents.sh" \
			block/disk \
			"${bdfs_to_wait_for[@]}" &
		sync_pid=$!
	fi
fi

if [[ $os == Linux ]]; then
	if [[ -n $HUGEPGSZ && ! -e /sys/kernel/mm/hugepages/hugepages-${HUGEPGSZ}kB ]]; then
		echo "${HUGEPGSZ}kB is not supported by the running kernel, ignoring" >&2
		unset -v HUGEPGSZ
	fi

	HUGEPGSZ=${HUGEPGSZ:-$(grep Hugepagesize /proc/meminfo | cut -d : -f 2 | tr -dc '0-9')}
	HUGEPGSZ_MB=$((HUGEPGSZ / 1024))
	: ${NRHUGE=$(((HUGEMEM + HUGEPGSZ_MB - 1) / HUGEPGSZ_MB))}
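	# Worked example (assuming defaults): HUGEMEM=2048 (MB) with 2048 kB hugepages
	# gives HUGEPGSZ_MB=2 and NRHUGE=(2048 + 2 - 1) / 2 = 1024 - a ceiling
	# division, so any remainder rounds the page count up.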

	if [ "$mode" == "config" ]; then
		configure_linux
	elif [ "$mode" == "cleanup" ]; then
		cleanup_linux
		clear_hugepages
	elif [ "$mode" == "reset" ]; then
		reset_linux
	elif [ "$mode" == "status" ]; then
		status_linux
	elif [ "$mode" == "help" ]; then
		usage $0
	else
		usage $0 "Invalid argument '$mode'"
	fi
else
	if [ "$mode" == "config" ]; then
		configure_freebsd
	elif [ "$mode" == "reset" ]; then
		reset_freebsd
	elif [ "$mode" == "cleanup" ]; then
		echo "setup.sh cleanup function not yet supported on $os"
	elif [ "$mode" == "status" ]; then
		status_freebsd
	elif [ "$mode" == "help" ]; then
		usage $0
	else
		usage $0 "Invalid argument '$mode'"
	fi
fi

if [[ -e /proc/$sync_pid/status ]]; then
	wait "$sync_pid"
fi