xref: /spdk/scripts/setup.sh (revision aef00d4420639a8e1abff899f43eda21992dec42)
#!/usr/bin/env bash

# Abort on the first unhandled error. nullglob makes unmatched globs expand
# to nothing and extglob enables the +(...) / @(...) patterns used by the
# functions defined below.
set -e
shopt -s nullglob extglob

os=$(uname -s)

# Only the Linux and FreeBSD code paths are implemented in this script.
if [[ $os != Linux && $os != FreeBSD ]]; then
	echo "Not supported platform ($os), aborting"
	exit 1
fi

# Resolve the repository root relative to this script. Both expansions are
# quoted so that a checkout path containing whitespace is handled correctly.
rootdir=$(readlink -f "$(dirname "$0")")/..
source "$rootdir/scripts/common.sh"
15
# Print the help text and exit.
# Globals:   os (read) - selects the list of supported modes
# Arguments: $1 - path of this script (only its basename is shown)
#            $2 - optional message printed before the help text
# Outputs:   help text on stdout
# Returns:   never returns; always terminates the shell with status 0
function usage() {
	local options

	if [[ $os == Linux ]]; then
		options="[config|reset|status|cleanup|help]"
	else
		options="[config|reset|help]"
	fi

	if [[ -n $2 ]]; then
		echo "$2"
		echo ""
	fi
	echo "Helper script for allocating hugepages and binding NVMe, I/OAT, VMD and Virtio devices"
	echo "to a generic VFIO kernel driver. If VFIO is not available on the system, this script"
	echo "will fall back to UIO. NVMe and Virtio devices with active mountpoints will be ignored."
	echo "All hugepage operations use default hugepage size on the system (hugepagesz)."
	# Quote the path so a script location containing whitespace still yields
	# the right name.
	echo "Usage: $(basename "$1") $options"
	echo
	echo "$options - as following:"
	echo "config            Default mode. Allocate hugepages and bind PCI devices."
	if [[ $os == Linux ]]; then
		echo "cleanup           Remove any orphaned files that can be left in the system after SPDK application exit"
	fi
	echo "reset             Rebind PCI devices back to their original drivers."
	echo "                  Also cleanup any leftover spdk files/resources."
	echo "                  Hugepage memory size will remain unchanged."
	if [[ $os == Linux ]]; then
		echo "status            Print status of all SPDK-compatible devices on the system."
	fi
	echo "help              Print this help message."
	echo
	echo "The following environment variables can be specified."
	echo "HUGEMEM           Size of hugepage memory to allocate (in MB). 2048 by default."
	echo "                  For NUMA systems, the hugepages will be distributed on node0 by"
	echo "                  default."
	echo "HUGE_EVEN_ALLOC   If set to 'yes', hugepages will be evenly distributed across all"
	echo "                  system's NUMA nodes (effectively ignoring anything set in HUGENODE)."
	echo "                  Uses kernel's default for hugepages size."
	echo "NRHUGE            Number of hugepages to allocate. This variable overwrites HUGEMEM."
	echo "HUGENODE          Specific NUMA node to allocate hugepages on. Multiple nodes can be"
	echo "                  separated with commas. By default, NRHUGE will be applied on each node."
	echo "                  Hugepages can be defined per node with e.g.:"
	echo "                  HUGENODE='nodes_hp[0]=2048,nodes_hp[1]=512,2' - this will allocate"
	echo "                  2048 pages for node0, 512 for node1 and default NRHUGE for node2."
	echo "HUGEPGSZ          Size of the hugepages to use in kB. If not set, kernel's default"
	echo "                  setting is used."
	echo "SHRINK_HUGE       If set to 'yes', hugepages allocation won't be skipped in case"
	echo "                  number of requested hugepages is lower from what's already"
	echo "                  allocated. Doesn't apply when HUGE_EVEN_ALLOC is in use."
	echo "CLEAR_HUGE        If set to 'yes', the attempt to remove hugepages from all nodes will"
	echo "                  be made prior to allocation."
	echo "PCI_ALLOWED"
	echo "PCI_BLOCKED       Whitespace separated list of PCI devices (NVMe, I/OAT, VMD, Virtio)."
	echo "                  Each device must be specified as a full PCI address."
	echo "                  E.g. PCI_ALLOWED=\"0000:01:00.0 0000:02:00.0\""
	echo "                  To block all PCI devices: PCI_ALLOWED=\"none\""
	echo "                  To allow all PCI devices except 0000:01:00.0: PCI_BLOCKED=\"0000:01:00.0\""
	echo "                  To allow only PCI device 0000:01:00.0: PCI_ALLOWED=\"0000:01:00.0\""
	echo "                  If PCI_ALLOWED and PCI_BLOCKED are empty or unset, all PCI devices"
	echo "                  will be bound."
	echo "                  Each device in PCI_BLOCKED will be ignored (driver won't be changed)."
	echo "                  PCI_BLOCKED has precedence over PCI_ALLOWED."
	echo "TARGET_USER       User that will own hugepage mountpoint directory and vfio groups."
	echo "                  By default the current user will be used."
	echo "DRIVER_OVERRIDE   Disable automatic vfio-pci/uio_pci_generic selection and forcefully"
	echo "                  bind devices to the given driver."
	echo "                  E.g. DRIVER_OVERRIDE=uio_pci_generic or DRIVER_OVERRIDE=/home/public/dpdk/build/kmod/igb_uio.ko"
	echo "PCI_BLOCK_SYNC_ON_RESET"
	echo "                  If set in the environment, the attempt to wait for block devices associated"
	echo "                  with given PCI device will be made upon reset"
	exit 0
}
87
# In monolithic kernels lsmod won't work, so back that with /sys/module.
# We also check /sys/bus/pci/drivers/ as neither lsmod nor /sys/module might
# contain the needed info (like in Fedora-like OS).
#
# Arguments: $1 - driver name; dashes are also tried as underscores
# Returns:   0 - driver was NOT found (or no name was given)
#            1 - driver is listed by lsmod
#            2 - driver has an entry under /sys/module or /sys/bus/pci/drivers
# Note the inverted convention: callers write "! check_for_driver foo" to
# test that foo is available.
function check_for_driver() {
	local name=$1
	local module=${1//-/_}

	if [[ -z $name ]]; then
		return 0
	fi

	# Compare against the whole module-name column. A plain substring grep
	# would report e.g. "nvme" as loaded when only "nvme_core" is present.
	if lsmod | awk -v module="$module" '$1 == module { found = 1 } END { exit !found }'; then
		return 1
	fi

	if [[ -d /sys/module/$name || -d /sys/module/$module ||
		-d /sys/bus/pci/drivers/$name || -d /sys/bus/pci/drivers/$module ]]; then
		return 2
	fi
	return 0
}
109
# Verify that both DPDK kernel modules (contigmem, nic_uio) can be found in
# one of the directories reported by "kldconfig -rU".
# Returns: 0 when both module files exist, 1 as soon as one is missing.
function check_for_driver_freebsd() {
	local module_dirs dir module located

	# kldconfig prints the module search path as a ';' separated list.
	IFS=";" read -ra module_dirs < <(kldconfig -rU)

	for module in contigmem.ko nic_uio.ko; do
		located=no
		for dir in "${module_dirs[@]}"; do
			if [[ -f $dir/$module ]]; then
				located=yes
				break
			fi
		done
		if [[ $located == no ]]; then
			return 1
		fi
	done
	return 0
}
123
# Print a message prefixed with the device address and its vendor/device ids.
# Globals:   pci_ids_vendor, pci_ids_device (read) - looked up by BDF
# Arguments: $1 - PCI address (BDF); remaining arguments form the message
function pci_dev_echo() {
	local bdf="$1"
	local vendor device

	shift
	# The ids are shown without their "0x" prefix.
	vendor=${pci_ids_vendor["$bdf"]#0x}
	device=${pci_ids_device["$bdf"]#0x}
	echo "$bdf ($vendor $device): $*"
}
129
# Move a PCI device from its current driver (if any) to the requested one.
# Globals:   drivers_d, pci_ids_vendor, pci_ids_device, TARGET_USER (read)
#            bdf, driver_name, old_driver_name, ven_dev_id, iommu_group
#            (written; intentionally not local - see the igb_uio fallback)
# Arguments: $1 - PCI address (BDF)
#            $2 - driver to bind to; "none" only unbinds the device
function linux_bind_driver() {
	bdf="$1"
	driver_name="$2"
	old_driver_name=${drivers_d["$bdf"]:-no driver}
	ven_dev_id="${pci_ids_vendor["$bdf"]#0x} ${pci_ids_device["$bdf"]#0x}"

	local sysfs_dev="/sys/bus/pci/devices/$bdf"
	local sysfs_drv="/sys/bus/pci/drivers/$driver_name"

	if [[ $driver_name == "$old_driver_name" ]]; then
		pci_dev_echo "$bdf" "Already using the $old_driver_name driver"
		return 0
	fi

	# Detach from the current driver first. Dropping the id is best-effort,
	# the unbind itself is not.
	if [[ $old_driver_name != "no driver" ]]; then
		echo "$ven_dev_id" > "$sysfs_dev/driver/remove_id" 2> /dev/null || true
		echo "$bdf" > "$sysfs_dev/driver/unbind"
	fi

	pci_dev_echo "$bdf" "$old_driver_name -> $driver_name"

	if [[ $driver_name == "none" ]]; then
		return 0
	fi

	# Both writes are best-effort; failures are deliberately ignored.
	echo "$ven_dev_id" > "$sysfs_drv/new_id" 2> /dev/null || true
	echo "$bdf" > "$sysfs_drv/bind" 2> /dev/null || true

	# Check if the uio_pci_generic driver is broken as it might be in
	# some 4.18.x kernels (see centos8 for instance) - if our device
	# didn't get a proper uio entry, fallback to igb_uio (provided that
	# check_for_driver reports it as present).
	if [[ $driver_name == uio_pci_generic && ! -e $sysfs_dev/uio ]] && ! check_for_driver igb_uio; then
		pci_dev_echo "$bdf" "uio_pci_generic potentially broken, moving to igb_uio"
		drivers_d["$bdf"]="no driver"
		# This call will override $driver_name for remaining devices as well
		linux_bind_driver "$bdf" igb_uio
	fi

	# Hand the vfio group node of this device over to TARGET_USER, if both exist.
	iommu_group=$(basename $(readlink -f $sysfs_dev/iommu_group))
	if [[ -e /dev/vfio/$iommu_group && -n $TARGET_USER ]]; then
		chown "$TARGET_USER" "/dev/vfio/$iommu_group"
	fi
}
174
# Detach a PCI device from whatever driver it is currently bound to.
# Globals:   drivers_d, pci_ids_vendor, pci_ids_device (read)
# Arguments: $1 - PCI address (BDF)
function linux_unbind_driver() {
	local bdf="$1"
	local current=${drivers_d["$bdf"]:-no driver}
	local ids="${pci_ids_vendor["$bdf"]#0x} ${pci_ids_device["$bdf"]#0x}"
	local drv_dir="/sys/bus/pci/drivers/$current"

	if [[ $current == "no driver" ]]; then
		pci_dev_echo "$bdf" "Not bound to any driver"
		return 0
	fi

	# Only touch sysfs when the driver directory is actually there. Dropping
	# the id is best-effort, the unbind itself is not.
	if [[ -e $drv_dir ]]; then
		echo "$ids" > "$drv_dir/remove_id" 2> /dev/null || true
		echo "$bdf" > "$drv_dir/unbind"
	fi

	pci_dev_echo "$bdf" "$current -> no driver"
}
193
# List the mountpoints of all hugetlbfs filesystems, one per line, taken from
# the third field of the matching "mount" output lines.
function linux_hugetlbfs_mounts() {
	mount | awk '/ type hugetlbfs / { print $3 }'
}
197
# Print the name of every block device under /sys/block whose resolved
# "device" link contains the given PCI address as a path component.
# Arguments: $1 - PCI address (BDF)
function get_block_dev_from_bdf() {
	local bdf=$1
	local entry

	for entry in /sys/block/*; do
		[[ $(readlink -f "$entry/device") == *"/$bdf/"* ]] || continue
		echo "${entry##*/}"
	done
}
208
# Report why the block devices behind a PCI device must be considered in use.
# Arguments: $1 - PCI address (BDF)
# Outputs:   one line per finding: "holder@<dev>:<holder>",
#            "mount@<block>:<dev>" or "data@<block>"
# Returns:   1 when lsblk is not available
function get_used_bdf_block_devs() {
	local bdf=$1
	local blocks block parent dev mount holder
	local used

	hash lsblk &> /dev/null || return 1
	blocks=($(get_block_dev_from_bdf "$bdf"))

	for block in "${blocks[@]}"; do
		# A device (or one of its partitions) held by another device counts
		# as used, regardless of whether anything is mounted.
		for holder in "/sys/class/block/$block"*/holders/*; do
			parent=${holder%/holders*}
			parent=${parent##*/}
			if [[ -e $holder && -e $holder/slaves/$parent ]]; then
				used+=("holder@$parent:${holder##*/}")
			fi
		done
		# Anything lsblk lists with an existing mountpoint is used as well.
		while read -r dev mount; do
			if [[ -e $mount ]]; then
				used+=("mount@$block:$dev")
			fi
		done < <(lsblk -l -n -o NAME,MOUNTPOINT "/dev/$block")
		# Nothing found so far: fall back to block_in_use to make sure we
		# don't miss more complex setups like ZFS pools, etc.
		if ((${#used[@]} == 0)) && block_in_use "$block" > /dev/null; then
			used+=("data@$block")
		fi
	done

	((${#used[@]} == 0)) || printf '%s\n' "${used[@]}"
}
246
# Build the per-type device tables for NVMe, IOAT, DSA, IAA, VIRTIO and VMD.
#
# The id definitions are grepped out of include/spdk/pci_ids.h and looked up
# in pci_bus_cache. Every matching device is stored in <type>_d and in
# all_devices_d with a value of 0 (usable) or 1 (must be left alone), and the
# name of its current kernel driver, if any, is recorded in drivers_d.
#
# Globals:   rootdir, pci_bus_cache, PCI_ALLOWED, DRIVER_OVERRIDE, mode (read)
#            nvme_d, ioat_d, dsa_d, iaa_d, virtio_d, vmd_d, all_devices_d,
#            drivers_d (declared global and written)
# Arguments: $1 - mode; for "status" no usability checks are performed
function collect_devices() {
	local ids dev_type dev_id bdf bdfs in_use driver

	ids+="PCI_DEVICE_ID_INTEL_IOAT"
	ids+="|PCI_DEVICE_ID_INTEL_DSA"
	ids+="|PCI_DEVICE_ID_INTEL_IAA"
	ids+="|PCI_DEVICE_ID_VIRTIO"
	ids+="|PCI_DEVICE_ID_INTEL_VMD"
	ids+="|SPDK_PCI_CLASS_NVME"

	local -gA nvme_d ioat_d dsa_d iaa_d virtio_d vmd_d all_devices_d drivers_d

	# Each matching line is read as "<ignored> <name> <value>".
	while read -r _ dev_type dev_id; do
		# Default lookup key uses vendor 0x8086; NVMe entries are keyed by
		# the bare value and virtio entries by vendor 0x1af4.
		bdfs=(${pci_bus_cache["0x8086:$dev_id"]})
		[[ $dev_type == *NVME* ]] && bdfs=(${pci_bus_cache["$dev_id"]})
		[[ $dev_type == *VIRT* ]] && bdfs=(${pci_bus_cache["0x1af4:$dev_id"]})
		# Reduce the definition name to the lowercase type used as table prefix.
		[[ $dev_type =~ (NVME|IOAT|DSA|IAA|VIRTIO|VMD) ]] && dev_type=${BASH_REMATCH[1],,}
		for bdf in "${bdfs[@]}"; do
			in_use=0
			if [[ $1 != status ]]; then
				if ! pci_can_use "$bdf"; then
					pci_dev_echo "$bdf" "Skipping denied controller at $bdf"
					in_use=1
				fi
				if [[ $dev_type == nvme || $dev_type == virtio ]]; then
					if ! verify_bdf_block_devs "$bdf"; then
						in_use=1
					fi
				fi
				if [[ $dev_type == vmd ]]; then
					# VMD controllers are only touched when listed in PCI_ALLOWED.
					if [[ $PCI_ALLOWED != *"$bdf"* ]]; then
						pci_dev_echo "$bdf" "Skipping not allowed VMD controller at $bdf"
						in_use=1
					elif [[ " ${drivers_d[*]} " =~ "nvme" ]]; then
						if [[ "${DRIVER_OVERRIDE}" != "none" ]]; then
							if [ "$mode" == "config" ]; then
								cat <<- MESSAGE
									Binding new driver to VMD device. If there are NVMe SSDs behind the VMD endpoint
									which are attached to the kernel NVMe driver, the binding process may go faster
									if you first run this script with DRIVER_OVERRIDE="none" to unbind only the
									NVMe SSDs, and then run again to unbind the VMD devices.
								MESSAGE
							fi
						fi
					fi
				fi
			fi
			# The target table name is only known at run time.
			eval "${dev_type}_d[$bdf]=$in_use"
			all_devices_d["$bdf"]=$in_use
			if [[ -e /sys/bus/pci/devices/$bdf/driver ]]; then
				driver=$(readlink -f "/sys/bus/pci/devices/$bdf/driver")
				drivers_d["$bdf"]=${driver##*/}
			fi
		done
	done < <(grep -E "$ids" "$rootdir/include/spdk/pci_ids.h")
}
305
# Print the name of the kernel driver a device should be handed back to.
# Globals:   nvme_d, ioat_d, dsa_d, iaa_d, virtio_d, vmd_d (read)
# Arguments: $1 - PCI address (BDF)
# Outputs:   driver name on stdout (an empty line when none was determined)
function collect_driver() {
	local bdf=$1
	local modules driver
	local modalias=/sys/bus/pci/devices/$bdf/modalias

	if [[ -e $modalias ]] \
		&& modules=($(modprobe -R "$(< "$modalias")")); then
		# Pick first entry in case multiple aliases are bound to a driver.
		driver=$(readlink -f "/sys/module/${modules[0]}/drivers/pci:"*)
		driver=${driver##*/}
	else
		# Fall back to a fixed name per device table. Should a device be
		# listed in several tables, vmd takes precedence, then virtio, iaa,
		# dsa, ioat and finally nvme.
		if [[ -n ${vmd_d["$bdf"]} ]]; then
			driver=vmd
		elif [[ -n ${virtio_d["$bdf"]} ]]; then
			driver=virtio-pci
		elif [[ -n ${iaa_d["$bdf"]} ]]; then
			driver=iaa
		elif [[ -n ${dsa_d["$bdf"]} ]]; then
			driver=dsa
		elif [[ -n ${ioat_d["$bdf"]} ]]; then
			driver=ioatdma
		elif [[ -n ${nvme_d["$bdf"]} ]]; then
			driver=nvme
		fi
	fi 2> /dev/null
	echo "$driver"
}
325
# Check that no block device behind the given PCI device is in use.
# Arguments: $1 - PCI address (BDF)
# Returns:   0 when nothing is in use; 1 when something is (the findings are
#            reported via pci_dev_echo) or when the lookup itself failed
function verify_bdf_block_devs() {
	local bdf=$1
	local active

	active=($(get_used_bdf_block_devs "$bdf")) || return 1
	((${#active[@]} == 0)) && return 0

	# Join the findings with commas for the message.
	local IFS=","
	pci_dev_echo "$bdf" "Active devices: ${active[*]}, so not binding PCI dev"
	return 1
}
337
# Select a userspace-capable driver and bind every usable device to it.
#
# Selection order: DRIVER_OVERRIDE ("none", a module name or a path to a .ko
# file), then vfio-pci when IOMMU groups exist or vfio's unsafe no-IOMMU mode
# is enabled, then uio_pci_generic, then the igb_uio module from the DPDK
# build directory.
#
# Globals:   rootdir, DRIVER_OVERRIDE, all_devices_d, nvme_d (read)
#            driver_name, igb_uio_fallback, bdf (written, not local)
# Returns:   1 when no suitable driver could be found
function configure_linux_pci() {
	local driver_path=""
	local igb_uio_ko="$rootdir/dpdk/build-tmp/kernel/linux/igb_uio/igb_uio.ko"
	local noiommu=/sys/module/vfio/parameters/enable_unsafe_noiommu_mode
	driver_name=""
	igb_uio_fallback=""

	if [[ -r $igb_uio_ko ]]; then
		# igb_uio is a common driver to override with and it depends on uio.
		modprobe uio || true
		if ! check_for_driver igb_uio || insmod "$igb_uio_ko"; then
			igb_uio_fallback="$igb_uio_ko"
		fi
	fi

	if [[ "${DRIVER_OVERRIDE}" == "none" ]]; then
		driver_name=none
	elif [[ -n "${DRIVER_OVERRIDE}" ]]; then
		driver_path="$DRIVER_OVERRIDE"
		driver_name="${DRIVER_OVERRIDE##*/}"
		# modprobe and the sysfs don't use the .ko suffix.
		driver_name=${driver_name%.ko}
		# path = name -> there is no path
		if [[ "$driver_path" = "$driver_name" ]]; then
			driver_path=""
		fi
	elif [[ -n "$(ls /sys/kernel/iommu_groups)" || (-e $noiommu && "$(cat "$noiommu")" == "Y") ]]; then
		driver_name=vfio-pci
		# Just in case, attempt to load VFIO_IOMMU_TYPE1 module into the kernel - this
		# should be done automatically by modprobe since this particular module should
		# be a part of vfio-pci dependencies, however, on some distros, it seems that
		# it's not the case. See #1689.
		if modinfo vfio_iommu_type1 > /dev/null; then
			modprobe vfio_iommu_type1
		fi
	elif ! check_for_driver uio_pci_generic || modinfo uio_pci_generic > /dev/null 2>&1; then
		driver_name=uio_pci_generic
	elif [[ -e $igb_uio_fallback ]]; then
		driver_path="$igb_uio_fallback"
		driver_name="igb_uio"
		echo "WARNING: uio_pci_generic not detected - using $driver_name"
	else
		echo "No valid drivers found [vfio-pci, uio_pci_generic, igb_uio]. Please enable one of the kernel modules."
		return 1
	fi

	# modprobe assumes the directory of the module. If the user passes in a path, we should use insmod.
	# Both arguments are quoted so that a path containing whitespace stays a single argument.
	if [[ $driver_name != "none" ]]; then
		if [[ -n "$driver_path" ]]; then
			insmod "$driver_path" || true
		else
			modprobe "$driver_name"
		fi
	fi

	for bdf in "${!all_devices_d[@]}"; do
		if ((all_devices_d["$bdf"] == 0)); then
			if [[ -n ${nvme_d["$bdf"]} ]]; then
				# Some nvme controllers may take significant amount of time while being
				# unbound from the driver. Put that task into background to speed up the
				# whole process. Currently this is done only for the devices bound to the
				# nvme driver as other, i.e., ioatdma's, trigger a kernel BUG when being
				# unbound in parallel. See https://bugzilla.kernel.org/show_bug.cgi?id=209041.
				linux_bind_driver "$bdf" "$driver_name" &
			else
				linux_bind_driver "$bdf" "$driver_name"
			fi
		fi
	done
	# Wait for the background binds started above.
	wait

	echo "1" > "/sys/bus/pci/rescan"
}
411
412function cleanup_linux() {
413	local dirs_to_clean=() files_to_clean=() opened_files=() file_locks=()
414	local match_spdk="spdk_tgt|iscsi|vhost|nvmf|rocksdb|bdevio|bdevperf|vhost_fuzz|nvme_fuzz|accel_perf|bdev_svc"
415
416	dirs_to_clean=({/var/run,/tmp}/dpdk/spdk{,_pid}+([0-9]))
417	if [[ -d $XDG_RUNTIME_DIR ]]; then
418		dirs_to_clean+=("$XDG_RUNTIME_DIR/dpdk/spdk"{,_pid}+([0-9]))
419	fi
420
421	for dir in "${dirs_to_clean[@]}"; do
422		files_to_clean+=("$dir/"*)
423	done
424	file_locks+=(/var/tmp/spdk_pci_lock*)
425
426	files_to_clean+=(/dev/shm/@(@($match_spdk)_trace|spdk_iscsi_conns))
427	files_to_clean+=("${file_locks[@]}")
428
429	# This may fail in case path that readlink attempts to resolve suddenly
430	# disappears (as it may happen with terminating processes).
431	opened_files+=($(readlink -f /proc/+([0-9])/fd/+([0-9]))) || true
432
433	if ((${#opened_files[@]} == 0)); then
434		echo "Can't get list of opened files!"
435		exit 1
436	fi
437
438	echo 'Cleaning'
439	for f in "${files_to_clean[@]}"; do
440		[[ -e $f ]] || continue
441		if [[ ${opened_files[*]} != *"$f"* ]]; then
442			echo "Removing:    $f"
443			rm $f
444		else
445			echo "Still open: $f"
446		fi
447	done
448
449	for dir in "${dirs_to_clean[@]}"; do
450		[[ -d $dir ]] || continue
451		if [[ ${opened_files[*]} != *"$dir"* ]]; then
452			echo "Removing:    $dir"
453			rmdir $dir
454		else
455			echo "Still open: $dir"
456		fi
457	done
458	echo "Clean"
459}
460
# Write the requested number of hugepages to a nr_hugepages interface and
# verify the value read back afterwards.
# Globals:   NRHUGE (read) - requested count; negative values are written as 0
#            SHRINK_HUGE (read) - "yes" allows writing a lower count
# Arguments: $1 - path to the nr_hugepages file
#            $2 - optional node id, only used in the messages
# Returns:   1 when fewer pages than requested are reported after the write
check_hugepages_alloc() {
	local hp_int=$1
	local where=${2:+on node$2}
	local current

	current=$(< "$hp_int")

	# Enough pages are already there and shrinking was not asked for.
	if ((NRHUGE <= current)) && [[ $SHRINK_HUGE != yes ]]; then
		echo "INFO: Requested $NRHUGE hugepages but $current already allocated $where"
		return 0
	fi

	echo $((NRHUGE < 0 ? 0 : NRHUGE)) > "$hp_int"

	current=$(< "$hp_int")
	((current < NRHUGE)) || return 0

	printf '\n%s\n%s\n' \
		"## ERROR: requested $NRHUGE hugepages but $current could be allocated $where." \
		"## Memory might be heavily fragmented. Please try flushing the system cache, or reboot the machine."
	return 1
}
484
# Request zero hugepages through the system-wide interface.
clear_hugepages() {
	printf '%s\n' 0 > /proc/sys/vm/nr_hugepages
}
486
# Allocate hugepages, either system-wide or on selected NUMA nodes.
#
# Globals:   CLEAR_HUGE, HUGE_EVEN_ALLOC, HUGENODE, HUGEPGSZ, NRHUGE (read)
# HUGENODE is a comma separated list whose entries are either a node id
# (gets NRHUGE pages) or "nodes_hp[<id>]=<pages>" (gets <pages> pages, or
# NRHUGE when <pages> is empty). Entries of any other form are ignored with
# a warning on stderr; they are never evaluated as shell code.
configure_linux_hugepages() {
	local node entry
	local -a nodes=() nodes_to_use=() nodes_hp=()

	if [[ $CLEAR_HUGE == yes ]]; then
		clear_hugepages
	fi

	if [[ $HUGE_EVEN_ALLOC == yes ]]; then
		# Go through the system-wide interface instead of the per-node ones.
		clear_hugepages
		check_hugepages_alloc /proc/sys/vm/nr_hugepages
		return 0
	fi

	# Map node id -> per-node nr_hugepages file for the selected page size.
	for node in /sys/devices/system/node/node*; do
		[[ -e $node ]] || continue
		nodes[${node##*node}]=$node/hugepages/hugepages-${HUGEPGSZ}kB/nr_hugepages
	done

	if ((${#nodes[@]} == 0)); then
		# No NUMA support? Fallback to common interface
		check_hugepages_alloc /proc/sys/vm/nr_hugepages
		return 0
	fi

	IFS="," read -ra nodes_to_use <<< "$HUGENODE"
	if ((${#nodes_to_use[@]} == 0)); then
		nodes_to_use[0]=0
	fi

	# Align indexes with node ids
	for entry in "${nodes_to_use[@]}"; do
		if [[ $entry =~ ^nodes_hp\[([0-9]+)\]=([0-9]*)$ ]]; then
			nodes_hp[${BASH_REMATCH[1]}]=${BASH_REMATCH[2]}
		elif [[ $entry =~ ^[0-9]+$ ]]; then
			nodes_hp[$entry]=$NRHUGE
		else
			echo "Invalid HUGENODE entry '$entry', ignoring" >&2
		fi
	done

	for node in "${!nodes_hp[@]}"; do
		if [[ -z ${nodes[node]} ]]; then
			echo "Node $node doesn't exist, ignoring" >&2
			continue
		fi
		NRHUGE=${nodes_hp[node]:-$NRHUGE} check_hugepages_alloc "${nodes[node]}" "$node"
	done
}
534
# Full "config" flow on Linux: bind the PCI devices, make sure a hugetlbfs
# mount exists, allocate hugepages and, for vfio-pci with a TARGET_USER, hand
# the mountpoints over to that user and report the user's memlock limit.
#
# Globals: TARGET_USER (read); driver_name (read, set by configure_linux_pci)
#          hugetlbfs_mounts, MEMLOCK_AMNT, MEMLOCK_MB (written)
function configure_linux() {
	local mount

	configure_linux_pci
	hugetlbfs_mounts=$(linux_hugetlbfs_mounts)

	if [[ -z $hugetlbfs_mounts ]]; then
		hugetlbfs_mounts=/mnt/huge
		echo "Mounting hugetlbfs at $hugetlbfs_mounts"
		mkdir -p "$hugetlbfs_mounts"
		mount -t hugetlbfs nodev "$hugetlbfs_mounts"
	fi

	configure_linux_hugepages

	if [[ $driver_name == "vfio-pci" && -n $TARGET_USER ]]; then
		# $hugetlbfs_mounts is a whitespace separated list, hence unquoted.
		for mount in $hugetlbfs_mounts; do
			chown "$TARGET_USER" "$mount"
			chmod g+w "$mount"
		done

		MEMLOCK_AMNT=$(su "$TARGET_USER" -c "ulimit -l")
		if [[ $MEMLOCK_AMNT != "unlimited" ]]; then
			MEMLOCK_MB=$((MEMLOCK_AMNT / 1024))
			cat <<- MEMLOCK
				"$TARGET_USER" user memlock limit: $MEMLOCK_MB MB

				This is the maximum amount of memory you will be
				able to use with DPDK and VFIO if run as user "$TARGET_USER".
				To change this, please adjust limits.conf memlock limit for user "$TARGET_USER".
			MEMLOCK
			if ((MEMLOCK_AMNT < 65536)); then
				echo ""
				echo "## WARNING: memlock limit is less than 64MB"
				echo -n "## DPDK with VFIO may not be able to initialize "
				echo "if run as user \"$TARGET_USER\"."
			fi
		fi
	fi

	# "uname -m" is used rather than the non-portable "uname -i", which
	# prints "unknown" on some systems and would silently skip this step.
	if [[ $(uname -m) == "x86_64" && ! -e /dev/cpu/0/msr ]]; then
		# Some distros build msr as a module.  Make sure it's loaded to ensure
		#  DPDK can easily figure out the TSC rate rather than relying on 100ms
		#  sleeps.
		modprobe msr &> /dev/null || true
	fi
}
582
# Give every usable device back to its kernel driver, or just unbind it when
# no driver could be determined or check_for_driver reports it as not found.
# Globals: all_devices_d (read); bdf, driver (written, not local)
function reset_linux_pci() {
	# virtio
	# TODO: check if virtio-pci is loaded first and just unbind if it is not loaded
	# Requires some more investigation - for example, some kernels do not seem to have
	#  virtio-pci but just virtio_scsi instead.  Also need to make sure we get the
	#  underscore vs. dash right in the virtio_scsi name.
	modprobe virtio-pci || true

	for bdf in "${!all_devices_d[@]}"; do
		# Skip the devices that were marked as not to be touched.
		if ! ((all_devices_d["$bdf"] == 0)); then
			continue
		fi

		driver=$(collect_driver "$bdf")
		# check_for_driver succeeds when the driver was NOT found.
		if [[ -z $driver ]] || check_for_driver "$driver"; then
			linux_unbind_driver "$bdf"
		else
			linux_bind_driver "$bdf" "$driver"
		fi
	done

	echo "1" > "/sys/bus/pci/rescan"
}
603
# Full "reset" flow on Linux: rebind the PCI devices, then remove the spdk
# map files from every hugetlbfs mount and the /run/.spdk* files.
function reset_linux() {
	reset_linux_pci

	for mount in $(linux_hugetlbfs_mounts); do
		for hp in "$mount"/spdk*map_*; do
			# Remove only the files on which a lock can be taken right away.
			if flock -n "$hp" true; then
				rm -f "$hp"
			fi
		done
	done

	rm -f /run/.spdk*
}
613
# Print the hugepage usage followed by a table of all collected devices.
# The table headers go to stderr, the data rows to stdout.
# Globals: all_devices_d, drivers_d, nvme_d, ioat_d, dsa_d, iaa_d, virtio_d,
#          vmd_d, pci_ids_vendor, pci_ids_device, HUGEPGSZ (read)
function status_linux() {
	local hp_fmt="%-6s %10s %8s / %6s\n"
	local dev_fmt='%-8s %-15s %-6s %-6s %-7s %-16s %-10s %s\n'

	echo "Hugepages" >&2
	printf "$hp_fmt" "node" "hugesize" "free" "total" >&2

	# One row per NUMA node and hugepage size.
	numa_nodes=0
	for path in /sys/devices/system/node/node*/hugepages/hugepages-*/; do
		numa_nodes=$((numa_nodes + 1))
		free_pages=$(< "$path/free_hugepages")
		all_pages=$(< "$path/nr_hugepages")

		[[ $path =~ (node[0-9]+)/hugepages/hugepages-([0-9]+kB) ]]

		node=${BASH_REMATCH[1]}
		huge_size=${BASH_REMATCH[2]}

		printf "$hp_fmt" $node $huge_size $free_pages $all_pages
	done

	# fall back to system-wide hugepages
	if ((numa_nodes == 0)); then
		free_pages=$(awk '/HugePages_Free/ { print $2 }' /proc/meminfo)
		all_pages=$(awk '/HugePages_Total/ { print $2 }' /proc/meminfo)
		node="-"
		huge_size="$HUGEPGSZ"

		printf "$hp_fmt" $node $huge_size $free_pages $all_pages
	fi

	printf "\n$dev_fmt" \
		"Type" "BDF" "Vendor" "Device" "NUMA" "Driver" "Device" "Block devices" >&2

	sorted_bdfs=($(printf '%s\n' "${!all_devices_d[@]}" | sort))

	for bdf in "${sorted_bdfs[@]}"; do
		driver=${drivers_d["$bdf"]}

		if ((numa_nodes == 0)); then
			node="-"
		else
			node=$(< "/sys/bus/pci/devices/$bdf/numa_node")
			if ((node == -1)); then
				node=unknown
			fi
		fi

		# Controller name is shown only for devices bound to the nvme driver.
		name="-"
		if [[ $driver == "nvme" && -d /sys/bus/pci/devices/$bdf/nvme ]]; then
			name=$(ls /sys/bus/pci/devices/$bdf/nvme)
		fi

		blknames=("-")
		if [[ -n ${nvme_d["$bdf"]} || -n ${virtio_d["$bdf"]} ]]; then
			blknames=($(get_block_dev_from_bdf "$bdf"))
		fi

		# First table that knows the device determines its type label.
		if [[ -n ${nvme_d["$bdf"]} ]]; then
			desc=NVMe
		elif [[ -n ${ioat_d["$bdf"]} ]]; then
			desc=I/OAT
		elif [[ -n ${dsa_d["$bdf"]} ]]; then
			desc=DSA
		elif [[ -n ${iaa_d["$bdf"]} ]]; then
			desc=IAA
		elif [[ -n ${virtio_d["$bdf"]} ]]; then
			desc=virtio
		elif [[ -n ${vmd_d["$bdf"]} ]]; then
			desc=VMD
		else
			desc=""
		fi

		printf "$dev_fmt" \
			"$desc" "$bdf" "${pci_ids_vendor["$bdf"]#0x}" "${pci_ids_device["$bdf"]#0x}" \
			"$node" "${driver:--}" "$name" "${blknames[*]:--}"
	done
}
682
# Print the contigmem state followed by a table of the collected NVMe, I/OAT,
# DSA, IAA and VMD devices. The table header goes to stderr.
# Globals: nvme_d, ioat_d, dsa_d, iaa_d, vmd_d, pci_ids_vendor,
#          pci_ids_device, pci_bus_driver (read)
function status_freebsd() {
	local pci

	# Print one row per given address, sorted by address.
	# Arguments: $1 - type label, remaining arguments - PCI addresses
	status_print() (
		local type=$1
		local dev driver

		shift

		for pci in "$@"; do
			printf '%-8s %-15s %-6s %-6s %-16s\n' \
				"$type" \
				"$pci" \
				"${pci_ids_vendor["$pci"]}" \
				"${pci_ids_device["$pci"]}" \
				"${pci_bus_driver["$pci"]}"
		done | sort -k2,2
	)

	local contigmem=present
	local contigmem_buffer_size
	local contigmem_num_buffers

	kldstat -q -m contigmem || contigmem="not present"

	# kenv fails when the given variable is not set in the kernel environment.
	if ! contigmem_buffer_size=$(kenv hw.contigmem.buffer_size 2> /dev/null); then
		contigmem_buffer_size="not set"
	fi
	if ! contigmem_num_buffers=$(kenv hw.contigmem.num_buffers 2> /dev/null); then
		contigmem_num_buffers="not set"
	fi

	printf 'Contigmem (%s)\nBuffer Size: %s\nNum Buffers: %s\n\n' \
		"$contigmem" "$contigmem_buffer_size" "$contigmem_num_buffers"

	printf '\n%-8s %-15s %-6s %-6s %-16s\n' \
		"Type" "BDF" "Vendor" "Device" "Driver" >&2

	status_print "NVMe" "${!nvme_d[@]}"
	status_print "I/OAT" "${!ioat_d[@]}"
	status_print "DSA" "${!dsa_d[@]}"
	status_print "IAA" "${!iaa_d[@]}"
	status_print "VMD" "${!vmd_d[@]}"
}
732
# Reload nic_uio with the list of all collected NVMe, I/OAT, DSA, IAA and VMD
# devices passed through the hw.nic_uio.bdfs kernel environment variable.
# Globals: nvme_d, ioat_d, dsa_d, iaa_d, vmd_d (read)
function configure_freebsd_pci() {
	local addrs

	addrs=(
		"${!nvme_d[@]}"
		"${!ioat_d[@]}"
		"${!dsa_d[@]}"
		"${!iaa_d[@]}"
		"${!vmd_d[@]}"
	)

	# Drop the domain part from all the addresses
	addrs=("${addrs[@]#*:}")

	# The list is handed over comma separated.
	local IFS=","
	kldunload nic_uio.ko || true
	kenv hw.nic_uio.bdfs="${addrs[*]}"
	kldload nic_uio.ko
}
750
# Full "config" flow on FreeBSD: reload nic_uio for the collected devices and
# (re)load contigmem with HUGEMEM / 256 buffers of 256 MB each.
# Globals: HUGEMEM (read); contigmem_num_buffers (written, not local)
# Returns: 1 when the DPDK kernel modules cannot be found
function configure_freebsd() {
	if ! check_for_driver_freebsd; then
		echo "DPDK drivers (contigmem and/or nic_uio) are missing, aborting" >&2
		return 1
	fi

	configure_freebsd_pci

	# If contigmem is already loaded but the HUGEMEM specified doesn't match the
	#  previous value, unload contigmem so that we can reload with the new value.
	if kldstat -q -m contigmem; then
		# contigmem may be loaded, but the kernel environment doesn't have to
		# be necessarily set at this point. If it isn't, kenv will fail to
		# pick up the hw. options. Handle it.
		contigmem_num_buffers=$(kenv hw.contigmem.num_buffers 2> /dev/null) \
			|| contigmem_num_buffers=-1
		if ((contigmem_num_buffers != HUGEMEM / 256)); then
			kldunload contigmem.ko
		fi
	fi

	# Not loaded (any more): set the parameters and load the module.
	if ! kldstat -q -m contigmem; then
		kenv hw.contigmem.num_buffers=$((HUGEMEM / 256))
		kenv hw.contigmem.buffer_size=$((256 * 1024 * 1024))
		kldload contigmem.ko
	fi
}
776
# "reset" flow on FreeBSD: unload both DPDK kernel modules; a module that
# fails to unload is ignored.
function reset_freebsd() {
	local module

	for module in contigmem.ko nic_uio.ko; do
		kldunload "$module" || true
	done
}
781
# Populate the PCI bus cache; CMD=reset is passed in the environment of the call.
CMD=reset cache_pci_bus

# The first argument selects the mode, "config" being the default.
mode=${1:-config}

# Defaults for the tunables that may be left unset.
HUGEMEM=${HUGEMEM:-2048}
PCI_ALLOWED=${PCI_ALLOWED:-}
PCI_BLOCKED=${PCI_BLOCKED:-}

# NVME_ALLOWED is appended to the list of allowed devices.
if [[ -n $NVME_ALLOWED ]]; then
	PCI_ALLOWED="$PCI_ALLOWED $NVME_ALLOWED"
fi

# SKIP_PCI replaces the list of allowed devices with "none".
if [[ -n $SKIP_PCI ]]; then
	PCI_ALLOWED="none"
fi

# Unless given explicitly, the target user is the sudo caller or, failing
# that, the login name (which may remain empty).
if [[ -z $TARGET_USER ]]; then
	TARGET_USER="$SUDO_USER"
	[[ -n $TARGET_USER ]] || TARGET_USER=$(logname 2> /dev/null) || true
fi

collect_devices "$mode"
810
if [[ $mode == reset && $PCI_BLOCK_SYNC_ON_RESET == yes ]]; then
	# Note that this will wait only for the first block device attached to
	# a given storage controller. For nvme this may miss some of the devs
	# in case multiple namespaces are being in place.
	# FIXME: Wait for nvme controller(s) to be in live state and determine
	# number of configured namespaces, build list of potential block devs
	# and pass them to sync_dev_uevents. Is it worth the effort?
	bdfs_to_wait_for=()
	for bdf in "${!all_devices_d[@]}"; do
		# Usable devices only ...
		((all_devices_d["$bdf"] == 0)) || continue
		# ... that are nvme or virtio controllers ...
		[[ -n ${nvme_d["$bdf"]} || -n ${virtio_d["$bdf"]} ]] || continue
		# ... and whose current driver differs from the one collect_driver reports.
		[[ $(collect_driver "$bdf") != "${drivers_d["$bdf"]}" ]] || continue
		bdfs_to_wait_for+=("$bdf")
	done
	if ((${#bdfs_to_wait_for[@]} > 0)); then
		echo "Waiting for block devices as requested"
		export UEVENT_TIMEOUT=5 DEVPATH_LOOKUP=yes DEVPATH_SUBSYSTEM=pci
		# Started in the background; the script waits for it at the very end.
		"$rootdir/scripts/sync_dev_uevents.sh" \
			block/disk \
			"${bdfs_to_wait_for[@]}" &
		sync_pid=$!
	fi
fi
835
# Dispatch the requested mode to the OS specific implementation.
if [[ $os == Linux ]]; then
	if [[ -n $HUGEPGSZ && ! -e /sys/kernel/mm/hugepages/hugepages-${HUGEPGSZ}kB ]]; then
		echo "${HUGEPGSZ}kB is not supported by the running kernel, ignoring" >&2
		unset -v HUGEPGSZ
	fi

	# Fall back to the kernel's default hugepage size (in kB).
	HUGEPGSZ=${HUGEPGSZ:-$(grep Hugepagesize /proc/meminfo | cut -d : -f 2 | tr -dc '0-9')}
	HUGEPGSZ_MB=$((HUGEPGSZ / 1024))
	# Number of pages needed to cover HUGEMEM (MB), rounded up. Computed in
	# kB so that hugepage sizes below 1 MB don't end up dividing by zero.
	: "${NRHUGE=$(((HUGEMEM * 1024 + HUGEPGSZ - 1) / HUGEPGSZ))}"

	case "$mode" in
		config)
			configure_linux
			;;
		cleanup)
			cleanup_linux
			clear_hugepages
			;;
		reset)
			reset_linux
			;;
		status)
			status_linux
			;;
		help)
			usage "$0"
			;;
		*)
			usage "$0" "Invalid argument '$mode'"
			;;
	esac
else
	case "$mode" in
		config)
			configure_freebsd
			;;
		reset)
			reset_freebsd
			;;
		cleanup)
			echo "setup.sh cleanup function not yet supported on $os"
			;;
		status)
			status_freebsd
			;;
		help)
			usage "$0"
			;;
		*)
			usage "$0" "Invalid argument '$mode'"
			;;
	esac
fi

# Wait for the background uevent sync, if one was started and is still alive.
if [[ -e /proc/$sync_pid/status ]]; then
	wait "$sync_pid"
fi
879