#!/usr/bin/env bash
#  SPDX-License-Identifier: BSD-3-Clause
#  Copyright (C) 2016 Intel Corporation
#  All rights reserved.
#
set -e
shopt -s nullglob extglob

os=$(uname -s)

if [[ $os != Linux && $os != FreeBSD ]]; then
	echo "Unsupported platform ($os), aborting"
	exit 1
fi

rootdir=$(readlink -f $(dirname $0))/..
source "$rootdir/scripts/common.sh"

function usage() {
	if [[ $os == Linux ]]; then
		options="[config|reset|status|cleanup|help]"
	else
		options="[config|reset|help]"
	fi

	[[ -n $2 ]] && (
		echo "$2"
		echo ""
	)
	echo "Helper script for allocating hugepages and binding NVMe, I/OAT, VMD and Virtio devices"
	echo "to a generic VFIO kernel driver. If VFIO is not available on the system, this script"
	echo "will fall back to UIO. NVMe and Virtio devices with active mountpoints will be ignored."
	echo "All hugepage operations use the default hugepage size on the system (hugepagesz)."
	echo "Usage: $(basename $1) $options"
	echo
	echo "$options - as follows:"
	echo "config            Default mode. Allocate hugepages and bind PCI devices."
	if [[ $os == Linux ]]; then
		echo "cleanup           Remove any orphaned files that may be left in the system after SPDK application exit"
	fi
	echo "reset             Rebind PCI devices back to their original drivers."
	echo "                  Also clean up any leftover spdk files/resources."
	echo "                  Hugepage memory size will remain unchanged."
	if [[ $os == Linux ]]; then
		echo "status            Print status of all SPDK-compatible devices on the system."
	fi
	echo "help              Print this help message."
	echo
	echo "The following environment variables can be specified."
	echo "HUGEMEM           Size of hugepage memory to allocate (in MB). 2048 by default."
	echo "                  For NUMA systems, the hugepages will be distributed on node0 by"
	echo "                  default."
	echo "HUGE_EVEN_ALLOC   If set to 'yes', hugepages will be evenly distributed across all"
	echo "                  system's NUMA nodes (effectively ignoring anything set in HUGENODE)."
	echo "                  Uses the kernel's default hugepage size."
	echo "NRHUGE            Number of hugepages to allocate. This variable overrides HUGEMEM."
	echo "HUGENODE          Specific NUMA node to allocate hugepages on. Multiple nodes can be"
	echo "                  separated with commas. By default, NRHUGE will be applied on each node."
	echo "                  Hugepages can be defined per node with e.g.:"
	echo "                  HUGENODE='nodes_hp[0]=2048,nodes_hp[1]=512,2' - this will allocate"
	echo "                  2048 pages for node0, 512 for node1 and default NRHUGE for node2."
	echo "HUGEPGSZ          Size of the hugepages to use in kB. If not set, kernel's default"
	echo "                  setting is used."
	echo "SHRINK_HUGE       If set to 'yes', hugepage allocation won't be skipped in case the"
	echo "                  number of requested hugepages is lower than what's already"
	echo "                  allocated. Doesn't apply when HUGE_EVEN_ALLOC is in use."
	echo "CLEAR_HUGE        If set to 'yes', an attempt to remove hugepages from all nodes will"
	echo "                  be made prior to allocation."
	echo "PCI_ALLOWED"
	echo "PCI_BLOCKED       Whitespace separated list of PCI devices (NVMe, I/OAT, VMD, Virtio)."
	echo "                  Each device must be specified as a full PCI address."
	echo "                  E.g. PCI_ALLOWED=\"0000:01:00.0 0000:02:00.0\""
	echo "                  To block all PCI devices: PCI_ALLOWED=\"none\""
	echo "                  To allow all PCI devices except 0000:01:00.0: PCI_BLOCKED=\"0000:01:00.0\""
	echo "                  To allow only PCI device 0000:01:00.0: PCI_ALLOWED=\"0000:01:00.0\""
	echo "                  If PCI_ALLOWED and PCI_BLOCKED are empty or unset, all PCI devices"
	echo "                  will be bound."
	echo "                  Each device in PCI_BLOCKED will be ignored (driver won't be changed)."
	echo "                  PCI_BLOCKED has precedence over PCI_ALLOWED."
	echo "TARGET_USER       User that will own hugepage mountpoint directory and vfio groups."
	echo "                  By default the current user will be used."
	echo "DRIVER_OVERRIDE   Disable automatic vfio-pci/uio_pci_generic selection and forcefully"
	echo "                  bind devices to the given driver."
	echo "                  E.g. DRIVER_OVERRIDE=uio_pci_generic or DRIVER_OVERRIDE=/home/public/dpdk/build/kmod/igb_uio.ko"
	echo "PCI_BLOCK_SYNC_ON_RESET"
	echo "                  If set in the environment, an attempt to wait for block devices associated"
	echo "                  with a given PCI device will be made upon reset."
	exit 0
}
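
# Illustrative invocations (the addresses and sizes below are placeholders,
# not recommendations):
#   sudo ./setup.sh                              # same as "config"
#   sudo HUGEMEM=4096 ./setup.sh config          # allocate 4 GiB of hugepages
#   sudo PCI_ALLOWED="0000:01:00.0" ./setup.sh   # bind just one controller
#   sudo ./setup.sh reset                        # rebind original drivers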

# In monolithic kernels lsmod won't work, so back it with /sys/module. We
# also check /sys/bus/pci/drivers/ as neither lsmod nor /sys/module might
# contain the needed info (like in Fedora-like OSes).
function check_for_driver() {
	if [[ -z $1 ]]; then
		return 0
	fi

	if lsmod | grep -q ${1//-/_}; then
		return 1
	fi

	if [[ -d /sys/module/${1} || -d \
		/sys/module/${1//-/_} || -d \
		/sys/bus/pci/drivers/${1} || -d \
		/sys/bus/pci/drivers/${1//-/_} ]]; then
		return 2
	fi
	return 0
}
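
# Note on the return codes above: 0 means the driver could not be found, while
# non-zero means it is present in some form. Callers therefore negate the
# result, e.g. "! check_for_driver igb_uio" is true when igb_uio is already
# available on the system.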

function check_for_driver_freebsd() {
	# Check if dpdk drivers (nic_uio, contigmem) are in the kernel's module path.
	local search_paths path driver
	IFS=";" read -ra search_paths < <(kldconfig -rU)

	for driver in contigmem.ko nic_uio.ko; do
		for path in "${search_paths[@]}"; do
			[[ -f $path/$driver ]] && continue 2
		done
		return 1
	done
	return 0
}

function pci_dev_echo() {
	local bdf="$1"
	shift
	echo "$bdf (${pci_ids_vendor["$bdf"]#0x} ${pci_ids_device["$bdf"]#0x}): $*"
}

function linux_bind_driver() {
	bdf="$1"
	driver_name="$2"
	old_driver_name=${drivers_d["$bdf"]:-no driver}

	if [[ $driver_name == "$old_driver_name" ]]; then
		pci_dev_echo "$bdf" "Already using the $old_driver_name driver"
		return 0
	fi

	if [[ $old_driver_name != "no driver" ]]; then
		echo "$bdf" > "/sys/bus/pci/devices/$bdf/driver/unbind"
	fi

	pci_dev_echo "$bdf" "$old_driver_name -> $driver_name"

	if [[ $driver_name == "none" ]]; then
		return 0
	fi

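	# Writing the driver name to the device's driver_override attribute makes
	# the kernel consider only that driver on the next probe, and the write to
	# drivers_probe asks the PCI core to (re)probe the device. The override is
	# cleared afterwards so future probes behave normally again.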
	local probe_attempts=0
	echo "$driver_name" > "/sys/bus/pci/devices/$bdf/driver_override"
	while ! echo "$bdf" > "/sys/bus/pci/drivers_probe" && ((probe_attempts++ < 10)); do
		pci_dev_echo "$bdf" "failed to bind to $driver_name, retrying ($probe_attempts)"
		sleep 0.5
	done 2> /dev/null

	echo "" > "/sys/bus/pci/devices/$bdf/driver_override"

	if [[ ! -e /sys/bus/pci/drivers/$driver_name/$bdf ]]; then
		pci_dev_echo "$bdf" "failed to bind to $driver_name, aborting"
		return 1
	fi

	iommu_group=$(basename $(readlink -f /sys/bus/pci/devices/$bdf/iommu_group))
	if [ -e "/dev/vfio/$iommu_group" ]; then
		if [ -n "$TARGET_USER" ]; then
			chown "$TARGET_USER" "/dev/vfio/$iommu_group"
		fi
	fi
}
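
# For reference, the manual equivalent of linux_bind_driver() for a single
# device is (0000:01:00.0 and vfio-pci are placeholders):
#   echo "0000:01:00.0" > /sys/bus/pci/devices/0000:01:00.0/driver/unbind
#   echo "vfio-pci" > /sys/bus/pci/devices/0000:01:00.0/driver_override
#   echo "0000:01:00.0" > /sys/bus/pci/drivers_probe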

function linux_unbind_driver() {
	local bdf="$1"
	local old_driver_name=${drivers_d["$bdf"]:-no driver}

	if [[ $old_driver_name == "no driver" ]]; then
		pci_dev_echo "$bdf" "Not bound to any driver"
		return 0
	fi

	if [[ -e /sys/bus/pci/drivers/$old_driver_name ]]; then
		echo "$bdf" > "/sys/bus/pci/drivers/$old_driver_name/unbind"
		echo "" > "/sys/bus/pci/devices/$bdf/driver_override"
	fi

	pci_dev_echo "$bdf" "$old_driver_name -> no driver"
}

function linux_hugetlbfs_mounts() {
	mount | grep ' type hugetlbfs ' | awk '{ print $3 }'
}

function get_block_dev_from_bdf() {
	local bdf=$1
	local block

	for block in /sys/block/*; do
		if [[ $(readlink -f "$block/device") == *"/$bdf/"* ]]; then
			echo "${block##*/}"
		fi
	done
}
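
# Example (illustrative): for an NVMe controller at a placeholder address,
# the above prints the block devices sitting on top of it, e.g.:
#   get_block_dev_from_bdf 0000:01:00.0   # -> nvme0n1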

function get_used_bdf_block_devs() {
	local bdf=$1
	local blocks block blockp dev mount holder
	local used

	hash lsblk &> /dev/null || return 1
	blocks=($(get_block_dev_from_bdf "$bdf"))

	for block in "${blocks[@]}"; do
		# Check if the device is held by some other device, regardless of
		# whether it's mounted or not.
		for holder in "/sys/class/block/$block"*/holders/*; do
			[[ -e $holder ]] || continue
			blockp=${holder%/holders*} blockp=${blockp##*/}
			if [[ -e $holder/slaves/$blockp ]]; then
				used+=("holder@$blockp:${holder##*/}")
			fi
		done
		while read -r dev mount; do
			if [[ -e $mount ]]; then
				used+=("mount@$block:$dev")
			fi
		done < <(lsblk -l -n -o NAME,MOUNTPOINT "/dev/$block")
		if ((${#used[@]} == 0)); then
			# Make sure we check if there's any valid data present on the target device
			# regardless of whether it's being actively used or not. This is mainly done
			# to make sure we don't miss more complex setups like ZFS pools, etc.
			if block_in_use "$block" > /dev/null; then
				used+=("data@$block")
			fi
		fi
	done

	if ((${#used[@]} > 0)); then
		printf '%s\n' "${used[@]}"
	fi
}

function collect_devices() {
	# NVMe, IOAT, DSA, IAA, VIRTIO, VMD

	local ids dev_type dev_id bdf bdfs in_use driver

	ids+="PCI_DEVICE_ID_INTEL_IOAT"
	ids+="|PCI_DEVICE_ID_INTEL_DSA"
	ids+="|PCI_DEVICE_ID_INTEL_IAA"
	ids+="|PCI_DEVICE_ID_VIRTIO"
	ids+="|PCI_DEVICE_ID_INTEL_VMD"
	ids+="|SPDK_PCI_CLASS_NVME"

	local -gA nvme_d ioat_d dsa_d iaa_d virtio_d vmd_d all_devices_d drivers_d

	while read -r _ dev_type dev_id; do
		bdfs=(${pci_bus_cache["0x8086:$dev_id"]})
		[[ $dev_type == *NVME* ]] && bdfs=(${pci_bus_cache["$dev_id"]})
		[[ $dev_type == *VIRT* ]] && bdfs=(${pci_bus_cache["0x1af4:$dev_id"]})
		[[ $dev_type =~ (NVME|IOAT|DSA|IAA|VIRTIO|VMD) ]] && dev_type=${BASH_REMATCH[1],,}
		for bdf in "${bdfs[@]}"; do
			in_use=0
			if [[ $1 != status ]]; then
				if ! pci_can_use "$bdf"; then
					pci_dev_echo "$bdf" "Skipping denied controller at $bdf"
					in_use=1
				fi
				if [[ $dev_type == nvme || $dev_type == virtio ]]; then
					if ! verify_bdf_block_devs "$bdf"; then
						in_use=1
					fi
				fi
				if [[ $dev_type == vmd ]]; then
					if [[ $PCI_ALLOWED != *"$bdf"* ]]; then
						pci_dev_echo "$bdf" "Skipping not allowed VMD controller at $bdf"
						in_use=1
					elif [[ " ${drivers_d[*]} " =~ "nvme" ]]; then
						if [[ "${DRIVER_OVERRIDE}" != "none" ]]; then
							if [ "$mode" == "config" ]; then
								cat <<- MESSAGE
									Binding new driver to VMD device. If there are NVMe SSDs behind the VMD endpoint
									which are attached to the kernel NVMe driver, the binding process may go faster
									if you first run this script with DRIVER_OVERRIDE="none" to unbind only the
									NVMe SSDs, and then run again to unbind the VMD devices.
								MESSAGE
							fi
						fi
					fi
				fi
			fi
			eval "${dev_type}_d[$bdf]=$in_use"
			all_devices_d["$bdf"]=$in_use
			if [[ -e /sys/bus/pci/devices/$bdf/driver ]]; then
				driver=$(readlink -f "/sys/bus/pci/devices/$bdf/driver")
				drivers_d["$bdf"]=${driver##*/}
			fi
		done
	done < <(grep -E "$ids" "$rootdir/include/spdk/pci_ids.h")
}
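
# After collect_devices() returns, each per-type array (nvme_d, ioat_d, ...)
# maps a PCI address to 0 (eligible for this run) or 1 (skipped), e.g.
# ${nvme_d["0000:01:00.0"]} (placeholder address), while drivers_d maps the
# address to the currently bound kernel driver, if any.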

function collect_driver() {
	local bdf=$1
	local drivers driver

	if [[ -e /sys/bus/pci/devices/$bdf/modalias ]] \
		&& drivers=($(modprobe -R "$(< "/sys/bus/pci/devices/$bdf/modalias")")); then
		# Pick first entry in case multiple aliases are bound to a driver.
		driver=$(readlink -f "/sys/module/${drivers[0]}/drivers/pci:"*)
		driver=${driver##*/}
	else
		[[ -n ${nvme_d["$bdf"]} ]] && driver=nvme
		[[ -n ${ioat_d["$bdf"]} ]] && driver=ioatdma
		[[ -n ${dsa_d["$bdf"]} ]] && driver=dsa
		[[ -n ${iaa_d["$bdf"]} ]] && driver=iaa
		[[ -n ${virtio_d["$bdf"]} ]] && driver=virtio-pci
		[[ -n ${vmd_d["$bdf"]} ]] && driver=vmd
	fi 2> /dev/null
	echo "$driver"
}

function verify_bdf_block_devs() {
	local bdf=$1
	local blknames
	blknames=($(get_used_bdf_block_devs "$bdf")) || return 1

	if ((${#blknames[@]} > 0)); then
		local IFS=","
		pci_dev_echo "$bdf" "Active devices: ${blknames[*]}, so not binding PCI dev"
		return 1
	fi
}

function configure_linux_pci() {
	local driver_path=""
	driver_name=""
	igb_uio_fallback=""

	if [[ -r "$rootdir/dpdk/build-tmp/kernel/linux/igb_uio/igb_uio.ko" ]]; then
		# igb_uio is a common driver to override with and it depends on uio.
		modprobe uio || true
		if ! check_for_driver igb_uio || insmod "$rootdir/dpdk/build-tmp/kernel/linux/igb_uio/igb_uio.ko"; then
			igb_uio_fallback="$rootdir/dpdk/build-tmp/kernel/linux/igb_uio/igb_uio.ko"
		fi
	fi

	if [[ "${DRIVER_OVERRIDE}" == "none" ]]; then
		driver_name=none
	elif [[ -n "${DRIVER_OVERRIDE}" ]]; then
		driver_path="$DRIVER_OVERRIDE"
		driver_name="${DRIVER_OVERRIDE##*/}"
		# modprobe and the sysfs don't use the .ko suffix.
		driver_name=${driver_name%.ko}
		# path = name -> there is no path
		if [[ "$driver_path" = "$driver_name" ]]; then
			driver_path=""
		fi
	elif [[ -n "$(ls /sys/kernel/iommu_groups)" || (-e \
	/sys/module/vfio/parameters/enable_unsafe_noiommu_mode && \
	"$(cat /sys/module/vfio/parameters/enable_unsafe_noiommu_mode)" == "Y") ]]; then
		driver_name=vfio-pci
		# Just in case, attempt to load the VFIO_IOMMU_TYPE1 module into the kernel - this
		# should be done automatically by modprobe since this particular module should
		# be a part of vfio-pci dependencies, however, on some distros, it seems that
		# it's not the case. See #1689.
		if modinfo vfio_iommu_type1 > /dev/null; then
			modprobe vfio_iommu_type1
		fi
	elif ! check_for_driver uio_pci_generic || modinfo uio_pci_generic > /dev/null 2>&1; then
		driver_name=uio_pci_generic
	elif [[ -e $igb_uio_fallback ]]; then
		driver_path="$igb_uio_fallback"
		driver_name="igb_uio"
		echo "WARNING: uio_pci_generic not detected - using $driver_name"
	else
		echo "No valid drivers found [vfio-pci, uio_pci_generic, igb_uio]. Please enable one of the kernel modules."
		return 1
	fi
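
	# To recap the selection above: DRIVER_OVERRIDE wins when set; otherwise
	# vfio-pci is preferred whenever IOMMU groups (or unsafe no-IOMMU mode)
	# are available, then uio_pci_generic, and finally the igb_uio module
	# built under dpdk/ as a last resort.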

	# modprobe searches the standard module directories. If the user passed in a
	# path to a module, use insmod instead.
	if [[ $driver_name != "none" ]]; then
		if [[ -n "$driver_path" ]]; then
			insmod $driver_path || true
		else
			modprobe $driver_name
		fi
	fi

	for bdf in "${!all_devices_d[@]}"; do
		if ((all_devices_d["$bdf"] == 0)); then
			if [[ -n ${nvme_d["$bdf"]} ]]; then
				# Some nvme controllers may take a significant amount of time while being
				# unbound from the driver. Put that task into the background to speed up
				# the whole process. Currently this is done only for the devices bound to
				# the nvme driver as others, e.g. ioatdma, trigger a kernel BUG when being
				# unbound in parallel. See https://bugzilla.kernel.org/show_bug.cgi?id=209041.
				linux_bind_driver "$bdf" "$driver_name" &
			else
				linux_bind_driver "$bdf" "$driver_name"
			fi
		fi
	done
	wait

	echo "1" > "/sys/bus/pci/rescan"
}

function cleanup_linux() {
	local dirs_to_clean=() files_to_clean=() opened_files=() file_locks=()
	local match_spdk="spdk_tgt|iscsi|vhost|nvmf|rocksdb|bdevio|bdevperf|vhost_fuzz|nvme_fuzz|accel_perf|bdev_svc"

	dirs_to_clean=({/var/run,/tmp}/dpdk/spdk{,_pid}+([0-9]))
	if [[ -d $XDG_RUNTIME_DIR ]]; then
		dirs_to_clean+=("$XDG_RUNTIME_DIR/dpdk/spdk"{,_pid}+([0-9]))
	fi

	for dir in "${dirs_to_clean[@]}"; do
		files_to_clean+=("$dir/"*)
	done
	file_locks+=(/var/tmp/spdk_pci_lock*)
	file_locks+=(/var/tmp/spdk_cpu_lock*)

	files_to_clean+=(/dev/shm/@(@($match_spdk)_trace|spdk_iscsi_conns)*)
	files_to_clean+=("${file_locks[@]}")

	# This may fail in case a path that readlink attempts to resolve suddenly
	# disappears (as may happen with terminating processes).
	opened_files+=($(readlink -f /proc/+([0-9])/fd/+([0-9]))) || true

	if ((${#opened_files[@]} == 0)); then
		echo "Can't get list of opened files!"
		exit 1
	fi

	echo 'Cleaning'
	for f in "${files_to_clean[@]}"; do
		[[ -e $f ]] || continue
		if [[ ${opened_files[*]} != *"$f"* ]]; then
			echo "Removing:    $f"
			rm $f
		else
			echo "Still open: $f"
		fi
	done

	for dir in "${dirs_to_clean[@]}"; do
		[[ -d $dir ]] || continue
		if [[ ${opened_files[*]} != *"$dir"* ]]; then
			echo "Removing:    $dir"
			rmdir $dir
		else
			echo "Still open: $dir"
		fi
	done
	echo "Clean"
}

check_hugepages_alloc() {
	local hp_int=$1
	local allocated_hugepages

	allocated_hugepages=$(< "$hp_int")

	if ((NRHUGE <= allocated_hugepages)) && [[ $SHRINK_HUGE != yes ]]; then
		echo "INFO: Requested $NRHUGE hugepages but $allocated_hugepages already allocated ${2:+on node$2}"
		return 0
	fi

	echo $((NRHUGE < 0 ? 0 : NRHUGE)) > "$hp_int"

	allocated_hugepages=$(< "$hp_int")
	if ((allocated_hugepages < NRHUGE)); then
		cat <<- ERROR

			## ERROR: requested $NRHUGE hugepages but only $allocated_hugepages could be allocated ${2:+on node$2}.
			## Memory might be heavily fragmented. Please try flushing the system cache, or reboot the machine.
		ERROR
		return 1
	fi
}
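
# check_hugepages_alloc is parameterized through the environment, e.g. (as done
# in configure_linux_hugepages() below, with an illustrative count):
#   NRHUGE=1024 check_hugepages_alloc "$node_path" 0
# where $node_path points at a per-node nr_hugepages sysfs file.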

clear_hugepages() { echo 0 > /proc/sys/vm/nr_hugepages; }

configure_linux_hugepages() {
	local node system_nodes
	local nodes_to_use nodes_hp

	if [[ $CLEAR_HUGE == yes ]]; then
		clear_hugepages
	fi

	if [[ $HUGE_EVEN_ALLOC == yes ]]; then
		clear_hugepages
		check_hugepages_alloc /proc/sys/vm/nr_hugepages
		return 0
	fi

	for node in /sys/devices/system/node/node*; do
		[[ -e $node ]] || continue
		nodes[${node##*node}]=$node/hugepages/hugepages-${HUGEPGSZ}kB/nr_hugepages
	done

	if ((${#nodes[@]} == 0)); then
		# No NUMA support? Fall back to the common interface.
		check_hugepages_alloc /proc/sys/vm/nr_hugepages
		return 0
	fi

	IFS="," read -ra nodes_to_use <<< "$HUGENODE"
	if ((${#nodes_to_use[@]} == 0)); then
		nodes_to_use[0]=0
	fi

	# Align indexes with node ids
	for node in "${!nodes_to_use[@]}"; do
		if [[ ${nodes_to_use[node]} =~ ^nodes_hp\[[0-9]+\]= ]]; then
			eval "${nodes_to_use[node]}"
		elif [[ ${nodes_to_use[node]} =~ ^[0-9]+$ ]]; then
			nodes_hp[nodes_to_use[node]]=$NRHUGE
		fi
	done

	for node in "${!nodes_hp[@]}"; do
		if [[ -z ${nodes[node]} ]]; then
			echo "Node $node doesn't exist, ignoring" >&2
			continue
		fi
		NRHUGE=${nodes_hp[node]:-$NRHUGE} check_hugepages_alloc "${nodes[node]}" "$node"
	done
}
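
# Example HUGENODE forms accepted above (values are illustrative):
#   HUGENODE=0,1                                  # NRHUGE pages on node0 and node1
#   HUGENODE='nodes_hp[0]=2048,nodes_hp[1]=512'   # explicit per-node counts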

function configure_linux() {
	configure_linux_pci
	hugetlbfs_mounts=$(linux_hugetlbfs_mounts)

	if [ -z "$hugetlbfs_mounts" ]; then
		hugetlbfs_mounts=/mnt/huge
		echo "Mounting hugetlbfs at $hugetlbfs_mounts"
		mkdir -p "$hugetlbfs_mounts"
		mount -t hugetlbfs nodev "$hugetlbfs_mounts"
	fi

	configure_linux_hugepages

	if [ "$driver_name" = "vfio-pci" ]; then
		if [ -n "$TARGET_USER" ]; then
			for mount in $hugetlbfs_mounts; do
				chown "$TARGET_USER" "$mount"
				chmod g+w "$mount"
			done

			MEMLOCK_AMNT=$(su "$TARGET_USER" -c "ulimit -l")
			if [[ $MEMLOCK_AMNT != "unlimited" ]]; then
				MEMLOCK_MB=$((MEMLOCK_AMNT / 1024))
				cat <<- MEMLOCK
					"$TARGET_USER" user memlock limit: $MEMLOCK_MB MB

					This is the maximum amount of memory you will be
					able to use with DPDK and VFIO if run as user "$TARGET_USER".
					To change this, please adjust limits.conf memlock limit for user "$TARGET_USER".
				MEMLOCK
				if ((MEMLOCK_AMNT < 65536)); then
					echo ""
					echo "## WARNING: memlock limit is less than 64MB"
					echo -n "## DPDK with VFIO may not be able to initialize "
					echo "if run as user \"$TARGET_USER\"."
				fi
			fi
		fi
	fi

	if [ $(uname -i) == "x86_64" ] && [ ! -e /dev/cpu/0/msr ]; then
		# Some distros build msr as a module.  Make sure it's loaded to ensure
		#  DPDK can easily figure out the TSC rate rather than relying on 100ms
		#  sleeps.
		modprobe msr &> /dev/null || true
	fi
}

function reset_linux_pci() {
	# virtio
	# TODO: check if virtio-pci is loaded first and just unbind if it is not loaded
	# Requires some more investigation - for example, some kernels do not seem to have
	#  virtio-pci but just virtio_scsi instead.  Also need to make sure we get the
	#  underscore vs. dash right in the virtio_scsi name.
	modprobe virtio-pci || true
	for bdf in "${!all_devices_d[@]}"; do
		((all_devices_d["$bdf"] == 0)) || continue

		driver=$(collect_driver "$bdf")
		if [[ -n $driver ]] && ! check_for_driver "$driver"; then
			linux_bind_driver "$bdf" "$driver"
		else
			linux_unbind_driver "$bdf"
		fi
	done

	echo "1" > "/sys/bus/pci/rescan"
}

function reset_linux() {
	reset_linux_pci
	for mount in $(linux_hugetlbfs_mounts); do
		for hp in "$mount"/spdk*map_*; do
			flock -n "$hp" true && rm -f "$hp"
		done
	done
	rm -f /run/.spdk*
}

function status_linux() {
	echo "Hugepages" >&2
	printf "%-6s %10s %8s / %6s\n" "node" "hugesize" "free" "total" >&2

	numa_nodes=0
	for path in /sys/devices/system/node/node*/hugepages/hugepages-*/; do
		numa_nodes=$((numa_nodes + 1))
		free_pages=$(cat $path/free_hugepages)
		all_pages=$(cat $path/nr_hugepages)

		[[ $path =~ (node[0-9]+)/hugepages/hugepages-([0-9]+kB) ]]

		node=${BASH_REMATCH[1]}
		huge_size=${BASH_REMATCH[2]}

		printf "%-6s %10s %8s / %6s\n" $node $huge_size $free_pages $all_pages
	done

	# fall back to system-wide hugepages
	if [ "$numa_nodes" = "0" ]; then
		free_pages=$(grep HugePages_Free /proc/meminfo | awk '{ print $2 }')
		all_pages=$(grep HugePages_Total /proc/meminfo | awk '{ print $2 }')
		node="-"
		huge_size="$HUGEPGSZ"

		printf "%-6s %10s %8s / %6s\n" $node $huge_size $free_pages $all_pages
	fi

	printf '\n%-8s %-15s %-6s %-6s %-7s %-16s %-10s %s\n' \
		"Type" "BDF" "Vendor" "Device" "NUMA" "Driver" "Device" "Block devices" >&2

	sorted_bdfs=($(printf '%s\n' "${!all_devices_d[@]}" | sort))

	for bdf in "${sorted_bdfs[@]}"; do
		driver=${drivers_d["$bdf"]}
		if [ "$numa_nodes" = "0" ]; then
			node="-"
		else
			node=$(cat /sys/bus/pci/devices/$bdf/numa_node)
			if ((node == -1)); then
				node=unknown
			fi
		fi
		if [ "$driver" = "nvme" ] && [ -d /sys/bus/pci/devices/$bdf/nvme ]; then
			name=$(ls /sys/bus/pci/devices/$bdf/nvme)
		else
			name="-"
		fi

		if [[ -n ${nvme_d["$bdf"]} || -n ${virtio_d["$bdf"]} ]]; then
			blknames=($(get_block_dev_from_bdf "$bdf"))
		else
			blknames=("-")
		fi

		desc=""
		desc=${desc:-${nvme_d["$bdf"]:+NVMe}}
		desc=${desc:-${ioat_d["$bdf"]:+I/OAT}}
		desc=${desc:-${dsa_d["$bdf"]:+DSA}}
		desc=${desc:-${iaa_d["$bdf"]:+IAA}}
		desc=${desc:-${virtio_d["$bdf"]:+virtio}}
		desc=${desc:-${vmd_d["$bdf"]:+VMD}}

		printf '%-8s %-15s %-6s %-6s %-7s %-16s %-10s %s\n' \
			"$desc" "$bdf" "${pci_ids_vendor["$bdf"]#0x}" "${pci_ids_device["$bdf"]#0x}" \
			"$node" "${driver:--}" "${name:-}" "${blknames[*]:--}"
	done
}

function status_freebsd() {
	local pci

	status_print() (
		local type=$1
		local dev driver

		shift

		for pci; do
			printf '%-8s %-15s %-6s %-6s %-16s\n' \
				"$type" \
				"$pci" \
				"${pci_ids_vendor["$pci"]}" \
				"${pci_ids_device["$pci"]}" \
				"${pci_bus_driver["$pci"]}"
		done | sort -k2,2
	)

	local contigmem=present
	local contigmem_buffer_size
	local contigmem_num_buffers

	if ! kldstat -q -m contigmem; then
		contigmem="not present"
	fi
	if ! contigmem_buffer_size=$(kenv hw.contigmem.buffer_size 2> /dev/null); then
		contigmem_buffer_size="not set"
	fi
	if ! contigmem_num_buffers=$(kenv hw.contigmem.num_buffers 2> /dev/null); then
		contigmem_num_buffers="not set"
	fi

	cat <<- BSD_INFO
		Contigmem ($contigmem)
		Buffer Size: $contigmem_buffer_size
		Num Buffers: $contigmem_num_buffers

	BSD_INFO

	printf '\n%-8s %-15s %-6s %-6s %-16s\n' \
		"Type" "BDF" "Vendor" "Device" "Driver" >&2

	status_print "NVMe" "${!nvme_d[@]}"
	status_print "I/OAT" "${!ioat_d[@]}"
	status_print "DSA" "${!dsa_d[@]}"
	status_print "IAA" "${!iaa_d[@]}"
	status_print "VMD" "${!vmd_d[@]}"
}

function configure_freebsd_pci() {
	local BDFS

	BDFS+=("${!nvme_d[@]}")
	BDFS+=("${!ioat_d[@]}")
	BDFS+=("${!dsa_d[@]}")
	BDFS+=("${!iaa_d[@]}")
	BDFS+=("${!vmd_d[@]}")

	# Drop the domain part from all the addresses
	BDFS=("${BDFS[@]#*:}")

	local IFS=","
	kldunload nic_uio.ko || true
	kenv hw.nic_uio.bdfs="${BDFS[*]}"
	kldload nic_uio.ko
}

function configure_freebsd() {
	if ! check_for_driver_freebsd; then
		echo "DPDK drivers (contigmem and/or nic_uio) are missing, aborting" >&2
		return 1
	fi
	configure_freebsd_pci
	# If contigmem is already loaded but the HUGEMEM specified doesn't match the
	#  previous value, unload contigmem so that we can reload with the new value.
	if kldstat -q -m contigmem; then
		# contigmem may be loaded, but the kernel environment doesn't
		# necessarily have to be set at this point. If it isn't, kenv will
		# fail to pick up the hw. options. Handle it.
		if ! contigmem_num_buffers=$(kenv hw.contigmem.num_buffers); then
			contigmem_num_buffers=-1
		fi 2> /dev/null
		if ((contigmem_num_buffers != HUGEMEM / 256)); then
			kldunload contigmem.ko
		fi
	fi
	if ! kldstat -q -m contigmem; then
		kenv hw.contigmem.num_buffers=$((HUGEMEM / 256))
		kenv hw.contigmem.buffer_size=$((256 * 1024 * 1024))
		kldload contigmem.ko
	fi
}
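
# contigmem backs HUGEMEM with fixed 256 MB buffers, so e.g. the default
# HUGEMEM=2048 results in hw.contigmem.num_buffers=8.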

function reset_freebsd() {
	kldunload contigmem.ko || true
	kldunload nic_uio.ko || true
}

CMD=reset cache_pci_bus

mode=$1

if [ -z "$mode" ]; then
	mode="config"
fi

: ${HUGEMEM:=2048}
: ${PCI_ALLOWED:=""}
: ${PCI_BLOCKED:=""}

if [ -n "$NVME_ALLOWED" ]; then
	PCI_ALLOWED="$PCI_ALLOWED $NVME_ALLOWED"
fi

if [ -n "$SKIP_PCI" ]; then
	PCI_ALLOWED="none"
fi

if [ -z "$TARGET_USER" ]; then
	TARGET_USER="$SUDO_USER"
	if [ -z "$TARGET_USER" ]; then
		TARGET_USER=$(logname 2> /dev/null) || true
	fi
fi

collect_devices "$mode"

if [[ $mode == reset && $PCI_BLOCK_SYNC_ON_RESET == yes ]]; then
	# Note that this will wait only for the first block device attached to
	# a given storage controller. For nvme this may miss some of the devs
	# in case multiple namespaces are in place.
	# FIXME: Wait for nvme controller(s) to be in live state and determine
	# number of configured namespaces, build list of potential block devs
	# and pass them to sync_dev_uevents. Is it worth the effort?
	bdfs_to_wait_for=()
	for bdf in "${!all_devices_d[@]}"; do
		((all_devices_d["$bdf"] == 0)) || continue
		if [[ -n ${nvme_d["$bdf"]} || -n ${virtio_d["$bdf"]} ]]; then
			[[ $(collect_driver "$bdf") != "${drivers_d["$bdf"]}" ]] || continue
			bdfs_to_wait_for+=("$bdf")
		fi
	done
	if ((${#bdfs_to_wait_for[@]} > 0)); then
		echo "Waiting for block devices as requested"
		export UEVENT_TIMEOUT=5 DEVPATH_LOOKUP=yes DEVPATH_SUBSYSTEM=pci
		"$rootdir/scripts/sync_dev_uevents.sh" \
			block/disk \
			"${bdfs_to_wait_for[@]}" &
		sync_pid=$!
	fi
fi

if [[ $os == Linux ]]; then
	if [[ -n $HUGEPGSZ && ! -e /sys/kernel/mm/hugepages/hugepages-${HUGEPGSZ}kB ]]; then
		echo "${HUGEPGSZ}kB is not supported by the running kernel, ignoring" >&2
		unset -v HUGEPGSZ
	fi

	HUGEPGSZ=${HUGEPGSZ:-$(grep Hugepagesize /proc/meminfo | cut -d : -f 2 | tr -dc '0-9')}
	HUGEPGSZ_MB=$((HUGEPGSZ / 1024))
	: ${NRHUGE=$(((HUGEMEM + HUGEPGSZ_MB - 1) / HUGEPGSZ_MB))}
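	# e.g. with the default HUGEMEM=2048 and 2048 kB hugepages (HUGEPGSZ_MB=2)
	# the line above resolves to NRHUGE=1024.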

	if [ "$mode" == "config" ]; then
		configure_linux
	elif [ "$mode" == "cleanup" ]; then
		cleanup_linux
		clear_hugepages
	elif [ "$mode" == "reset" ]; then
		reset_linux
	elif [ "$mode" == "status" ]; then
		status_linux
	elif [ "$mode" == "help" ]; then
		usage $0
	else
		usage $0 "Invalid argument '$mode'"
	fi
else
	if [ "$mode" == "config" ]; then
		configure_freebsd
	elif [ "$mode" == "reset" ]; then
		reset_freebsd
	elif [ "$mode" == "cleanup" ]; then
		echo "setup.sh cleanup function not yet supported on $os"
	elif [ "$mode" == "status" ]; then
		status_freebsd
	elif [ "$mode" == "help" ]; then
		usage $0
	else
		usage $0 "Invalid argument '$mode'"
	fi
fi

if [[ -e /proc/$sync_pid/status ]]; then
	wait "$sync_pid"
fi
878