#!/usr/bin/env bash
#  SPDX-License-Identifier: BSD-3-Clause
#  Copyright (C) 2016 Intel Corporation
#  All rights reserved.
#
set -e
shopt -s nullglob extglob

os=$(uname -s)

if [[ $os != Linux && $os != FreeBSD ]]; then
	echo "Unsupported platform ($os), aborting"
	exit 1
fi

rootdir=$(readlink -f $(dirname $0))/..
source "$rootdir/scripts/common.sh"

function usage() {
	if [[ $os == Linux ]]; then
		options="[config|reset|status|cleanup|help]"
	else
		options="[config|reset|help]"
	fi

	[[ -n $2 ]] && (
		echo "$2"
		echo ""
	)
	echo "Helper script for allocating hugepages and binding NVMe, I/OAT, VMD and Virtio devices"
	echo "to a generic VFIO kernel driver. If VFIO is not available on the system, this script"
	echo "will fall back to UIO. NVMe and Virtio devices with active mountpoints will be ignored."
	echo "All hugepage operations use the default hugepage size on the system (hugepagesz)."
	echo "Usage: $(basename $1) $options"
	echo
	echo "$options - as follows:"
	echo "config            Default mode. Allocate hugepages and bind PCI devices."
	if [[ $os == Linux ]]; then
		echo "cleanup           Remove any orphaned files that may be left in the system after an SPDK application exits"
	fi
	echo "reset             Rebind PCI devices back to their original drivers."
	echo "                  Also clean up any leftover SPDK files/resources."
	echo "                  Hugepage memory size will remain unchanged."
	if [[ $os == Linux ]]; then
		echo "status            Print status of all SPDK-compatible devices on the system."
	fi
	echo "help              Print this help message."
	echo
	echo "The following environment variables can be specified."
	echo "HUGEMEM           Size of hugepage memory to allocate (in MB). 2048 by default."
	echo "                  For NUMA systems, the hugepages will be distributed on node0 by"
	echo "                  default."
	echo "HUGE_EVEN_ALLOC   If set to 'yes', hugepages will be evenly distributed across all"
	echo "                  of the system's NUMA nodes (effectively ignoring anything set in HUGENODE)."
	echo "                  Uses the kernel's default hugepage size."
	echo "NRHUGE            Number of hugepages to allocate. This variable overrides HUGEMEM."
	echo "HUGENODE          Specific NUMA node to allocate hugepages on. Multiple nodes can be"
	echo "                  separated with commas. By default, NRHUGE will be applied on each node."
	echo "                  Hugepages can be defined per node with e.g.:"
	echo "                  HUGENODE='nodes_hp[0]=2048,nodes_hp[1]=512,2' - this will allocate"
	echo "                  2048 pages for node0, 512 for node1 and default NRHUGE for node2."
	echo "HUGEPGSZ          Size of the hugepages to use in kB. If not set, the kernel's default"
	echo "                  setting is used."
	echo "SHRINK_HUGE       If set to 'yes', hugepage allocation won't be skipped if the"
	echo "                  number of requested hugepages is lower than what's already"
	echo "                  allocated. Doesn't apply when HUGE_EVEN_ALLOC is in use."
	echo "CLEAR_HUGE        If set to 'yes', an attempt to remove hugepages from all nodes will"
	echo "                  be made prior to allocation."
	echo "PCI_ALLOWED"
	echo "PCI_BLOCKED       Whitespace-separated list of PCI devices (NVMe, I/OAT, VMD, Virtio)."
	echo "                  Each device must be specified as a full PCI address."
	echo "                  E.g. PCI_ALLOWED=\"0000:01:00.0 0000:02:00.0\""
	echo "                  To block all PCI devices: PCI_ALLOWED=\"none\""
	echo "                  To allow all PCI devices except 0000:01:00.0: PCI_BLOCKED=\"0000:01:00.0\""
	echo "                  To allow only PCI device 0000:01:00.0: PCI_ALLOWED=\"0000:01:00.0\""
	echo "                  If PCI_ALLOWED and PCI_BLOCKED are empty or unset, all PCI devices"
	echo "                  will be bound."
	echo "                  Each device in PCI_BLOCKED will be ignored (driver won't be changed)."
	echo "                  PCI_BLOCKED has precedence over PCI_ALLOWED."
	echo "TARGET_USER       User that will own the hugepage mountpoint directory and vfio groups."
	echo "                  By default the current user will be used."
	echo "DRIVER_OVERRIDE   Disable automatic vfio-pci/uio_pci_generic selection and forcefully"
	echo "                  bind devices to the given driver."
	echo "                  E.g. DRIVER_OVERRIDE=uio_pci_generic or DRIVER_OVERRIDE=/home/public/dpdk/build/kmod/igb_uio.ko"
	echo "PCI_BLOCK_SYNC_ON_RESET"
	echo "                  If set to 'yes', an attempt to wait for block devices associated"
	echo "                  with a given PCI device will be made upon reset"
	exit 0
}

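# Typical invocations (illustrative examples, assuming the script is run from
# the SPDK repository root):
#   sudo ./scripts/setup.sh                      # config mode with the 2048 MB default
#   sudo HUGEMEM=8192 ./scripts/setup.sh config  # allocate 8 GB of hugepage memory
#   sudo DRIVER_OVERRIDE=uio_pci_generic ./scripts/setup.sh config
#   sudo ./scripts/setup.sh reset                # rebind devices to their original drivers
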
# With monolithic kernels lsmod won't report built-in drivers, so back it up
# with a check of /sys/module. We also check /sys/bus/pci/drivers/ as neither
# lsmod nor /sys/module might contain the needed info (as on Fedora-like OSes).
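# Return 0 when the driver does not appear to be present, 1 when lsmod reports
# it as loaded and 2 when it is visible under /sys/module or
# /sys/bus/pci/drivers (e.g. built into the kernel).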
function check_for_driver() {
	if [[ -z $1 ]]; then
		return 0
	fi

	if lsmod | grep -q ${1//-/_}; then
		return 1
	fi

	if [[ -d /sys/module/${1} || -d \
		/sys/module/${1//-/_} || -d \
		/sys/bus/pci/drivers/${1} || -d \
		/sys/bus/pci/drivers/${1//-/_} ]]; then
		return 2
	fi
	return 0
}

function check_for_driver_freebsd() {
	# Check if dpdk drivers (nic_uio, contigmem) are in the kernel's module path.
	local search_paths path driver
	IFS=";" read -ra search_paths < <(kldconfig -rU)

	for driver in contigmem.ko nic_uio.ko; do
		for path in "${search_paths[@]}"; do
			[[ -f $path/$driver ]] && continue 2
		done
		return 1
	done
	return 0
}

function pci_dev_echo() {
	local bdf="$1"
	shift
	echo "$bdf (${pci_ids_vendor["$bdf"]#0x} ${pci_ids_device["$bdf"]#0x}): $*"
}

function linux_bind_driver() {
	bdf="$1"
	driver_name="$2"
	old_driver_name=${drivers_d["$bdf"]:-no driver}

	if [[ $driver_name == "$old_driver_name" ]]; then
		pci_dev_echo "$bdf" "Already using the $old_driver_name driver"
		return 0
	fi

	if [[ $old_driver_name != "no driver" ]]; then
		echo "$bdf" > "/sys/bus/pci/devices/$bdf/driver/unbind"
	fi

	pci_dev_echo "$bdf" "$old_driver_name -> $driver_name"

	if [[ $driver_name == "none" ]]; then
		return 0
	fi

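	# Standard sysfs binding sequence: set driver_override for this device, ask
	# the PCI core to probe it (binding it to the overriding driver), then clear
	# the override so future probes of the device are not affected.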
	echo "$driver_name" > "/sys/bus/pci/devices/$bdf/driver_override"
	echo "$bdf" > "/sys/bus/pci/drivers_probe"
	echo "" > "/sys/bus/pci/devices/$bdf/driver_override"

	if [[ $driver_name == uio_pci_generic ]] && ! check_for_driver igb_uio; then
		# Check if the uio_pci_generic driver is broken as it might be in
		# some 4.18.x kernels (see centos8 for instance) - if our device
		# didn't get a proper uio entry, fall back to igb_uio
		if [[ ! -e /sys/bus/pci/devices/$bdf/uio ]]; then
			pci_dev_echo "$bdf" "uio_pci_generic potentially broken, moving to igb_uio"
			drivers_d["$bdf"]="no driver"
			# This call will override $driver_name for the remaining devices as well
			linux_bind_driver "$bdf" igb_uio
		fi
	fi

	iommu_group=$(basename $(readlink -f /sys/bus/pci/devices/$bdf/iommu_group))
	if [ -e "/dev/vfio/$iommu_group" ]; then
		if [ -n "$TARGET_USER" ]; then
			chown "$TARGET_USER" "/dev/vfio/$iommu_group"
		fi
	fi
}

function linux_unbind_driver() {
	local bdf="$1"
	local old_driver_name=${drivers_d["$bdf"]:-no driver}

	if [[ $old_driver_name == "no driver" ]]; then
		pci_dev_echo "$bdf" "Not bound to any driver"
		return 0
	fi

	if [[ -e /sys/bus/pci/drivers/$old_driver_name ]]; then
		echo "$bdf" > "/sys/bus/pci/drivers/$old_driver_name/unbind"
		echo "" > "/sys/bus/pci/devices/$bdf/driver_override"
	fi

	pci_dev_echo "$bdf" "$old_driver_name -> no driver"
}

function linux_hugetlbfs_mounts() {
	mount | grep ' type hugetlbfs ' | awk '{ print $3 }'
}

function get_block_dev_from_bdf() {
	local bdf=$1
	local block

	for block in /sys/block/*; do
		if [[ $(readlink -f "$block/device") == *"/$bdf/"* ]]; then
			echo "${block##*/}"
		fi
	done
}

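# Print one entry per reason a block device behind the given bdf is considered
# in use:
#   holder@<dev>:<holder> - <dev> is claimed by another block device (e.g. dm/md)
#   mount@<dev>:<name>    - lsblk reports an active mountpoint for <name>
#   data@<dev>            - block_in_use() found existing data (e.g. a ZFS pool)
# Returns non-zero when lsblk is not available.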
function get_used_bdf_block_devs() {
	local bdf=$1
	local blocks block blockp dev mount holder
	local used

	hash lsblk &> /dev/null || return 1
	blocks=($(get_block_dev_from_bdf "$bdf"))

	for block in "${blocks[@]}"; do
		# Check if the device is held by some other device, regardless of
		# whether it's mounted or not.
		for holder in "/sys/class/block/$block"*/holders/*; do
			[[ -e $holder ]] || continue
			blockp=${holder%/holders*} blockp=${blockp##*/}
			if [[ -e $holder/slaves/$blockp ]]; then
				used+=("holder@$blockp:${holder##*/}")
			fi
		done
		while read -r dev mount; do
			if [[ -e $mount ]]; then
				used+=("mount@$block:$dev")
			fi
		done < <(lsblk -l -n -o NAME,MOUNTPOINT "/dev/$block")
		if ((${#used[@]} == 0)); then
			# Make sure we check if there's any valid data present on the target device
			# regardless of whether it's being actively used or not. This is mainly done
			# to make sure we don't miss more complex setups like ZFS pools, etc.
			if block_in_use "$block" > /dev/null; then
				used+=("data@$block")
			fi
		fi
	done

	if ((${#used[@]} > 0)); then
		printf '%s\n' "${used[@]}"
	fi
}

function collect_devices() {
	# NVMe, IOAT, DSA, IAA, VIRTIO, VMD

	local ids dev_type dev_id bdf bdfs in_use driver

	ids+="PCI_DEVICE_ID_INTEL_IOAT"
	ids+="|PCI_DEVICE_ID_INTEL_DSA"
	ids+="|PCI_DEVICE_ID_INTEL_IAA"
	ids+="|PCI_DEVICE_ID_VIRTIO"
	ids+="|PCI_DEVICE_ID_INTEL_VMD"
	ids+="|SPDK_PCI_CLASS_NVME"

	local -gA nvme_d ioat_d dsa_d iaa_d virtio_d vmd_d all_devices_d drivers_d

	while read -r _ dev_type dev_id; do
		bdfs=(${pci_bus_cache["0x8086:$dev_id"]})
		[[ $dev_type == *NVME* ]] && bdfs=(${pci_bus_cache["$dev_id"]})
		[[ $dev_type == *VIRT* ]] && bdfs=(${pci_bus_cache["0x1af4:$dev_id"]})
		[[ $dev_type =~ (NVME|IOAT|DSA|IAA|VIRTIO|VMD) ]] && dev_type=${BASH_REMATCH[1],,}
		for bdf in "${bdfs[@]}"; do
			in_use=0
			if [[ $1 != status ]]; then
				if ! pci_can_use "$bdf"; then
					pci_dev_echo "$bdf" "Skipping denied controller at $bdf"
					in_use=1
				fi
				if [[ $dev_type == nvme || $dev_type == virtio ]]; then
					if ! verify_bdf_block_devs "$bdf"; then
						in_use=1
					fi
				fi
				if [[ $dev_type == vmd ]]; then
					if [[ $PCI_ALLOWED != *"$bdf"* ]]; then
						pci_dev_echo "$bdf" "Skipping not allowed VMD controller at $bdf"
						in_use=1
					elif [[ " ${drivers_d[*]} " =~ "nvme" ]]; then
						if [[ "${DRIVER_OVERRIDE}" != "none" ]]; then
							if [ "$mode" == "config" ]; then
								cat <<- MESSAGE
									Binding new driver to VMD device. If there are NVMe SSDs behind the VMD endpoint
									which are attached to the kernel NVMe driver, the binding process may go faster
									if you first run this script with DRIVER_OVERRIDE="none" to unbind only the
									NVMe SSDs, and then run again to unbind the VMD devices.
								MESSAGE
							fi
						fi
					fi
				fi
			fi
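			# Record the device in its per-type array (e.g. nvme_d["$bdf"]) and in the
			# global map; 0 marks it as eligible for (re)binding, 1 marks it as skipped.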
			eval "${dev_type}_d[$bdf]=$in_use"
			all_devices_d["$bdf"]=$in_use
			if [[ -e /sys/bus/pci/devices/$bdf/driver ]]; then
				driver=$(readlink -f "/sys/bus/pci/devices/$bdf/driver")
				drivers_d["$bdf"]=${driver##*/}
			fi
		done
	done < <(grep -E "$ids" "$rootdir/include/spdk/pci_ids.h")
}

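# Resolve the kernel driver a device should be handed back to: ask modprobe to
# resolve the device's modalias and map the resulting module to its pci driver;
# fall back to a fixed per-type default when that information is unavailable.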
function collect_driver() {
	local bdf=$1
	local drivers driver

	if [[ -e /sys/bus/pci/devices/$bdf/modalias ]] \
		&& drivers=($(modprobe -R "$(< "/sys/bus/pci/devices/$bdf/modalias")")); then
		# Pick first entry in case multiple aliases are bound to a driver.
		driver=$(readlink -f "/sys/module/${drivers[0]}/drivers/pci:"*)
		driver=${driver##*/}
	else
		[[ -n ${nvme_d["$bdf"]} ]] && driver=nvme
		[[ -n ${ioat_d["$bdf"]} ]] && driver=ioatdma
		[[ -n ${dsa_d["$bdf"]} ]] && driver=dsa
		[[ -n ${iaa_d["$bdf"]} ]] && driver=iaa
		[[ -n ${virtio_d["$bdf"]} ]] && driver=virtio-pci
		[[ -n ${vmd_d["$bdf"]} ]] && driver=vmd
	fi 2> /dev/null
	echo "$driver"
}

function verify_bdf_block_devs() {
	local bdf=$1
	local blknames
	blknames=($(get_used_bdf_block_devs "$bdf")) || return 1

	if ((${#blknames[@]} > 0)); then
		local IFS=","
		pci_dev_echo "$bdf" "Active devices: ${blknames[*]}, so not binding PCI dev"
		return 1
	fi
}

function configure_linux_pci() {
	local driver_path=""
	driver_name=""
	igb_uio_fallback=""

	if [[ -r "$rootdir/dpdk/build-tmp/kernel/linux/igb_uio/igb_uio.ko" ]]; then
		# igb_uio is a common driver to override with and it depends on uio.
		modprobe uio || true
		if ! check_for_driver igb_uio || insmod "$rootdir/dpdk/build-tmp/kernel/linux/igb_uio/igb_uio.ko"; then
			igb_uio_fallback="$rootdir/dpdk/build-tmp/kernel/linux/igb_uio/igb_uio.ko"
		fi
	fi

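	# Driver selection order: an explicit DRIVER_OVERRIDE wins, then vfio-pci when
	# an IOMMU (or unsafe no-IOMMU mode) is available, then uio_pci_generic, and
	# finally the locally built igb_uio as a last resort.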
	if [[ "${DRIVER_OVERRIDE}" == "none" ]]; then
		driver_name=none
	elif [[ -n "${DRIVER_OVERRIDE}" ]]; then
		driver_path="$DRIVER_OVERRIDE"
		driver_name="${DRIVER_OVERRIDE##*/}"
		# modprobe and the sysfs don't use the .ko suffix.
		driver_name=${driver_name%.ko}
		# path = name -> there is no path
		if [[ "$driver_path" = "$driver_name" ]]; then
			driver_path=""
		fi
	elif [[ -n "$(ls /sys/kernel/iommu_groups)" || (-e \
	/sys/module/vfio/parameters/enable_unsafe_noiommu_mode && \
	"$(cat /sys/module/vfio/parameters/enable_unsafe_noiommu_mode)" == "Y") ]]; then
		driver_name=vfio-pci
		# Just in case, attempt to load the VFIO_IOMMU_TYPE1 module into the kernel - this
		# should be done automatically by modprobe since this particular module should
		# be a part of vfio-pci dependencies; however, on some distros that doesn't
		# seem to be the case. See #1689.
		if modinfo vfio_iommu_type1 > /dev/null; then
			modprobe vfio_iommu_type1
		fi
	elif ! check_for_driver uio_pci_generic || modinfo uio_pci_generic > /dev/null 2>&1; then
		driver_name=uio_pci_generic
	elif [[ -e $igb_uio_fallback ]]; then
		driver_path="$igb_uio_fallback"
		driver_name="igb_uio"
		echo "WARNING: uio_pci_generic not detected - using $driver_name"
	else
		echo "No valid drivers found [vfio-pci, uio_pci_generic, igb_uio]. Please enable one of the kernel modules."
		return 1
	fi

	# modprobe only looks in the standard module directories. If the user passed in a
	# path, use insmod instead.
	if [[ $driver_name != "none" ]]; then
		if [[ -n "$driver_path" ]]; then
			insmod $driver_path || true
		else
			modprobe $driver_name
		fi
	fi

	for bdf in "${!all_devices_d[@]}"; do
		if ((all_devices_d["$bdf"] == 0)); then
			if [[ -n ${nvme_d["$bdf"]} ]]; then
				# Some nvme controllers may take a significant amount of time while being
				# unbound from the driver. Put that task into the background to speed up the
				# whole process. Currently this is done only for devices bound to the
				# nvme driver as others (e.g. ioatdma) trigger a kernel BUG when being
				# unbound in parallel. See https://bugzilla.kernel.org/show_bug.cgi?id=209041.
				linux_bind_driver "$bdf" "$driver_name" &
			else
				linux_bind_driver "$bdf" "$driver_name"
			fi
		fi
	done
	wait

	echo "1" > "/sys/bus/pci/rescan"
}

function cleanup_linux() {
	local dirs_to_clean=() files_to_clean=() opened_files=() file_locks=()
	local match_spdk="spdk_tgt|iscsi|vhost|nvmf|rocksdb|bdevio|bdevperf|vhost_fuzz|nvme_fuzz|accel_perf|bdev_svc"

	dirs_to_clean=({/var/run,/tmp}/dpdk/spdk{,_pid}+([0-9]))
	if [[ -d $XDG_RUNTIME_DIR ]]; then
		dirs_to_clean+=("$XDG_RUNTIME_DIR/dpdk/spdk"{,_pid}+([0-9]))
	fi

	for dir in "${dirs_to_clean[@]}"; do
		files_to_clean+=("$dir/"*)
	done
	file_locks+=(/var/tmp/spdk_pci_lock*)
	file_locks+=(/var/tmp/spdk_cpu_lock*)

	files_to_clean+=(/dev/shm/@(@($match_spdk)_trace|spdk_iscsi_conns)*)
	files_to_clean+=("${file_locks[@]}")

	# This may fail in case a path that readlink attempts to resolve suddenly
	# disappears (as may happen with terminating processes).
	opened_files+=($(readlink -f /proc/+([0-9])/fd/+([0-9]))) || true

	if ((${#opened_files[@]} == 0)); then
		echo "Can't get list of opened files!"
		exit 1
	fi

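	# Only remove entries that no running process still holds open; anything that
	# shows up in opened_files is reported as "Still open" instead of being removed.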
	echo 'Cleaning'
	for f in "${files_to_clean[@]}"; do
		[[ -e $f ]] || continue
		if [[ ${opened_files[*]} != *"$f"* ]]; then
			echo "Removing:    $f"
			rm $f
		else
			echo "Still open: $f"
		fi
	done

	for dir in "${dirs_to_clean[@]}"; do
		[[ -d $dir ]] || continue
		if [[ ${opened_files[*]} != *"$dir"* ]]; then
			echo "Removing:    $dir"
			rmdir $dir
		else
			echo "Still open: $dir"
		fi
	done
	echo "Clean"
}

check_hugepages_alloc() {
	local hp_int=$1
	local allocated_hugepages

	allocated_hugepages=$(< "$hp_int")

	if ((NRHUGE <= allocated_hugepages)) && [[ $SHRINK_HUGE != yes ]]; then
		echo "INFO: Requested $NRHUGE hugepages but $allocated_hugepages already allocated ${2:+on node$2}"
		return 0
	fi

	echo $((NRHUGE < 0 ? 0 : NRHUGE)) > "$hp_int"

	allocated_hugepages=$(< "$hp_int")
	if ((allocated_hugepages < NRHUGE)); then
		cat <<- ERROR

			## ERROR: requested $NRHUGE hugepages but only $allocated_hugepages could be allocated ${2:+on node$2}.
			## Memory might be heavily fragmented. Please try flushing the system cache, or reboot the machine.
		ERROR
		return 1
	fi
}

clear_hugepages() { echo 0 > /proc/sys/vm/nr_hugepages; }

configure_linux_hugepages() {
	local node system_nodes
	local nodes_to_use nodes_hp

	if [[ $CLEAR_HUGE == yes ]]; then
		clear_hugepages
	fi

	if [[ $HUGE_EVEN_ALLOC == yes ]]; then
		clear_hugepages
		check_hugepages_alloc /proc/sys/vm/nr_hugepages
		return 0
	fi

	for node in /sys/devices/system/node/node*; do
		[[ -e $node ]] || continue
		nodes[${node##*node}]=$node/hugepages/hugepages-${HUGEPGSZ}kB/nr_hugepages
	done

	if ((${#nodes[@]} == 0)); then
		# No NUMA support? Fall back to the common interface
		check_hugepages_alloc /proc/sys/vm/nr_hugepages
		return 0
	fi

	IFS="," read -ra nodes_to_use <<< "$HUGENODE"
	if ((${#nodes_to_use[@]} == 0)); then
		nodes_to_use[0]=0
	fi

	# Align indexes with node ids
	for node in "${!nodes_to_use[@]}"; do
		if [[ ${nodes_to_use[node]} =~ ^nodes_hp\[[0-9]+\]= ]]; then
			eval "${nodes_to_use[node]}"
		elif [[ ${nodes_to_use[node]} =~ ^[0-9]+$ ]]; then
			nodes_hp[nodes_to_use[node]]=$NRHUGE
		fi
	done

	for node in "${!nodes_hp[@]}"; do
		if [[ -z ${nodes[node]} ]]; then
			echo "Node $node doesn't exist, ignoring" >&2
			continue
		fi
		NRHUGE=${nodes_hp[node]:-$NRHUGE} check_hugepages_alloc "${nodes[node]}" "$node"
	done
}

function configure_linux() {
	configure_linux_pci
	hugetlbfs_mounts=$(linux_hugetlbfs_mounts)

	if [ -z "$hugetlbfs_mounts" ]; then
		hugetlbfs_mounts=/mnt/huge
		echo "Mounting hugetlbfs at $hugetlbfs_mounts"
		mkdir -p "$hugetlbfs_mounts"
		mount -t hugetlbfs nodev "$hugetlbfs_mounts"
	fi

	configure_linux_hugepages

	if [ "$driver_name" = "vfio-pci" ]; then
		if [ -n "$TARGET_USER" ]; then
			for mount in $hugetlbfs_mounts; do
				chown "$TARGET_USER" "$mount"
				chmod g+w "$mount"
			done

			MEMLOCK_AMNT=$(su "$TARGET_USER" -c "ulimit -l")
			if [[ $MEMLOCK_AMNT != "unlimited" ]]; then
				MEMLOCK_MB=$((MEMLOCK_AMNT / 1024))
				cat <<- MEMLOCK
					"$TARGET_USER" user memlock limit: $MEMLOCK_MB MB

					This is the maximum amount of memory you will be
					able to use with DPDK and VFIO if run as user "$TARGET_USER".
					To change this, please adjust limits.conf memlock limit for user "$TARGET_USER".
				MEMLOCK
				if ((MEMLOCK_AMNT < 65536)); then
					echo ""
					echo "## WARNING: memlock limit is less than 64MB"
					echo -n "## DPDK with VFIO may not be able to initialize "
					echo "if run as user \"$TARGET_USER\"."
				fi
			fi
		fi
	fi

	if [ $(uname -i) == "x86_64" ] && [ ! -e /dev/cpu/0/msr ]; then
		# Some distros build msr as a module.  Make sure it's loaded to ensure
		#  DPDK can easily figure out the TSC rate rather than relying on 100ms
		#  sleeps.
		modprobe msr &> /dev/null || true
	fi
}

function reset_linux_pci() {
	# virtio
	# TODO: check if virtio-pci is loaded first and just unbind if it is not loaded
	# Requires some more investigation - for example, some kernels do not seem to have
	#  virtio-pci but just virtio_scsi instead.  Also need to make sure we get the
	#  underscore vs. dash right in the virtio_scsi name.
	modprobe virtio-pci || true
	for bdf in "${!all_devices_d[@]}"; do
		((all_devices_d["$bdf"] == 0)) || continue

		driver=$(collect_driver "$bdf")
		if [[ -n $driver ]] && ! check_for_driver "$driver"; then
			linux_bind_driver "$bdf" "$driver"
		else
			linux_unbind_driver "$bdf"
		fi
	done

	echo "1" > "/sys/bus/pci/rescan"
}

function reset_linux() {
	reset_linux_pci
	for mount in $(linux_hugetlbfs_mounts); do
		for hp in "$mount"/spdk*map_*; do
			flock -n "$hp" true && rm -f "$hp"
		done
	done
	rm -f /run/.spdk*
}

function status_linux() {
	echo "Hugepages" >&2
	printf "%-6s %10s %8s / %6s\n" "node" "hugesize" "free" "total" >&2

	numa_nodes=0
	for path in /sys/devices/system/node/node*/hugepages/hugepages-*/; do
		numa_nodes=$((numa_nodes + 1))
		free_pages=$(cat $path/free_hugepages)
		all_pages=$(cat $path/nr_hugepages)

		[[ $path =~ (node[0-9]+)/hugepages/hugepages-([0-9]+kB) ]]

		node=${BASH_REMATCH[1]}
		huge_size=${BASH_REMATCH[2]}

		printf "%-6s %10s %8s / %6s\n" $node $huge_size $free_pages $all_pages
	done

	# fall back to system-wide hugepages
	if [ "$numa_nodes" = "0" ]; then
		free_pages=$(grep HugePages_Free /proc/meminfo | awk '{ print $2 }')
		all_pages=$(grep HugePages_Total /proc/meminfo | awk '{ print $2 }')
		node="-"
		huge_size="$HUGEPGSZ"

		printf "%-6s %10s %8s / %6s\n" $node $huge_size $free_pages $all_pages
	fi

	printf '\n%-8s %-15s %-6s %-6s %-7s %-16s %-10s %s\n' \
		"Type" "BDF" "Vendor" "Device" "NUMA" "Driver" "Device" "Block devices" >&2

	sorted_bdfs=($(printf '%s\n' "${!all_devices_d[@]}" | sort))

	for bdf in "${sorted_bdfs[@]}"; do
		driver=${drivers_d["$bdf"]}
		if [ "$numa_nodes" = "0" ]; then
			node="-"
		else
			node=$(cat /sys/bus/pci/devices/$bdf/numa_node)
			if ((node == -1)); then
				node=unknown
			fi
		fi
		if [ "$driver" = "nvme" ] && [ -d /sys/bus/pci/devices/$bdf/nvme ]; then
			name=$(ls /sys/bus/pci/devices/$bdf/nvme)
		else
			name="-"
		fi

		if [[ -n ${nvme_d["$bdf"]} || -n ${virtio_d["$bdf"]} ]]; then
			blknames=($(get_block_dev_from_bdf "$bdf"))
		else
			blknames=("-")
		fi

		desc=""
		desc=${desc:-${nvme_d["$bdf"]:+NVMe}}
		desc=${desc:-${ioat_d["$bdf"]:+I/OAT}}
		desc=${desc:-${dsa_d["$bdf"]:+DSA}}
		desc=${desc:-${iaa_d["$bdf"]:+IAA}}
		desc=${desc:-${virtio_d["$bdf"]:+virtio}}
		desc=${desc:-${vmd_d["$bdf"]:+VMD}}

		printf '%-8s %-15s %-6s %-6s %-7s %-16s %-10s %s\n' \
			"$desc" "$bdf" "${pci_ids_vendor["$bdf"]#0x}" "${pci_ids_device["$bdf"]#0x}" \
			"$node" "${driver:--}" "${name:-}" "${blknames[*]:--}"
	done
}

function status_freebsd() {
	local pci

	status_print() (
		local type=$1
		local dev driver

		shift

		for pci; do
			printf '%-8s %-15s %-6s %-6s %-16s\n' \
				"$type" \
				"$pci" \
				"${pci_ids_vendor["$pci"]}" \
				"${pci_ids_device["$pci"]}" \
				"${pci_bus_driver["$pci"]}"
		done | sort -k2,2
	)

	local contigmem=present
	local contigmem_buffer_size
	local contigmem_num_buffers

	if ! kldstat -q -m contigmem; then
		contigmem="not present"
	fi
	if ! contigmem_buffer_size=$(kenv hw.contigmem.buffer_size 2> /dev/null); then
		contigmem_buffer_size="not set"
	fi
	if ! contigmem_num_buffers=$(kenv hw.contigmem.num_buffers 2> /dev/null); then
		contigmem_num_buffers="not set"
	fi

	cat <<- BSD_INFO
		Contigmem ($contigmem)
		Buffer Size: $contigmem_buffer_size
		Num Buffers: $contigmem_num_buffers

	BSD_INFO

	printf '\n%-8s %-15s %-6s %-6s %-16s\n' \
		"Type" "BDF" "Vendor" "Device" "Driver" >&2

	status_print "NVMe" "${!nvme_d[@]}"
	status_print "I/OAT" "${!ioat_d[@]}"
	status_print "DSA" "${!dsa_d[@]}"
	status_print "IAA" "${!iaa_d[@]}"
	status_print "VMD" "${!vmd_d[@]}"
}

function configure_freebsd_pci() {
	local BDFS

	BDFS+=("${!nvme_d[@]}")
	BDFS+=("${!ioat_d[@]}")
	BDFS+=("${!dsa_d[@]}")
	BDFS+=("${!iaa_d[@]}")
	BDFS+=("${!vmd_d[@]}")

	# Drop the domain part from all the addresses
	BDFS=("${BDFS[@]#*:}")

	local IFS=","
	kldunload nic_uio.ko || true
	kenv hw.nic_uio.bdfs="${BDFS[*]}"
	kldload nic_uio.ko
}

function configure_freebsd() {
	if ! check_for_driver_freebsd; then
		echo "DPDK drivers (contigmem and/or nic_uio) are missing, aborting" >&2
		return 1
	fi
	configure_freebsd_pci
	# If contigmem is already loaded but the HUGEMEM specified doesn't match the
	#  previous value, unload contigmem so that we can reload with the new value.
	if kldstat -q -m contigmem; then
		# contigmem may be loaded, but the kernel environment doesn't necessarily
		# have to be set at this point. If it isn't, kenv will fail to pick up
		# the hw. options. Handle it.
		if ! contigmem_num_buffers=$(kenv hw.contigmem.num_buffers); then
			contigmem_num_buffers=-1
		fi 2> /dev/null
		if ((contigmem_num_buffers != HUGEMEM / 256)); then
			kldunload contigmem.ko
		fi
	fi
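	# contigmem memory is handed out as 256 MB buffers, so e.g. the default
	# HUGEMEM=2048 results in 8 buffers.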
	if ! kldstat -q -m contigmem; then
		kenv hw.contigmem.num_buffers=$((HUGEMEM / 256))
		kenv hw.contigmem.buffer_size=$((256 * 1024 * 1024))
		kldload contigmem.ko
	fi
}

function reset_freebsd() {
	kldunload contigmem.ko || true
	kldunload nic_uio.ko || true
}

CMD=reset cache_pci_bus

mode=$1

if [ -z "$mode" ]; then
	mode="config"
fi

: ${HUGEMEM:=2048}
: ${PCI_ALLOWED:=""}
: ${PCI_BLOCKED:=""}

if [ -n "$NVME_ALLOWED" ]; then
	PCI_ALLOWED="$PCI_ALLOWED $NVME_ALLOWED"
fi

if [ -n "$SKIP_PCI" ]; then
	PCI_ALLOWED="none"
fi

if [ -z "$TARGET_USER" ]; then
	TARGET_USER="$SUDO_USER"
	if [ -z "$TARGET_USER" ]; then
		TARGET_USER=$(logname 2> /dev/null) || true
	fi
fi

collect_devices "$mode"

if [[ $mode == reset && $PCI_BLOCK_SYNC_ON_RESET == yes ]]; then
	# Note that this will wait only for the first block device attached to
	# a given storage controller. For nvme this may miss some of the devs
	# in case multiple namespaces are in place.
	# FIXME: Wait for nvme controller(s) to be in live state and determine
	# number of configured namespaces, build list of potential block devs
	# and pass them to sync_dev_uevents. Is it worth the effort?
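	# Wait only for devices that are about to change drivers and that can expose
	# block devices (nvme/virtio) - everything else has nothing to wait for.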
	bdfs_to_wait_for=()
	for bdf in "${!all_devices_d[@]}"; do
		((all_devices_d["$bdf"] == 0)) || continue
		if [[ -n ${nvme_d["$bdf"]} || -n ${virtio_d["$bdf"]} ]]; then
			[[ $(collect_driver "$bdf") != "${drivers_d["$bdf"]}" ]] || continue
			bdfs_to_wait_for+=("$bdf")
		fi
	done
	if ((${#bdfs_to_wait_for[@]} > 0)); then
		echo "Waiting for block devices as requested"
		export UEVENT_TIMEOUT=5 DEVPATH_LOOKUP=yes DEVPATH_SUBSYSTEM=pci
		"$rootdir/scripts/sync_dev_uevents.sh" \
			block/disk \
			"${bdfs_to_wait_for[@]}" &
		sync_pid=$!
	fi
fi

if [[ $os == Linux ]]; then
	if [[ -n $HUGEPGSZ && ! -e /sys/kernel/mm/hugepages/hugepages-${HUGEPGSZ}kB ]]; then
		echo "${HUGEPGSZ}kB is not supported by the running kernel, ignoring" >&2
		unset -v HUGEPGSZ
	fi

	HUGEPGSZ=${HUGEPGSZ:-$(grep Hugepagesize /proc/meminfo | cut -d : -f 2 | tr -dc '0-9')}
	HUGEPGSZ_MB=$((HUGEPGSZ / 1024))
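	# NRHUGE defaults to the number of default-sized hugepages needed to cover
	# HUGEMEM MB, rounded up - e.g. HUGEMEM=2048 with 2048 kB pages gives
	# NRHUGE=1024, and with 1 GiB (1048576 kB) pages it gives NRHUGE=2.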
	: ${NRHUGE=$(((HUGEMEM + HUGEPGSZ_MB - 1) / HUGEPGSZ_MB))}

	if [ "$mode" == "config" ]; then
		configure_linux
	elif [ "$mode" == "cleanup" ]; then
		cleanup_linux
		clear_hugepages
	elif [ "$mode" == "reset" ]; then
		reset_linux
	elif [ "$mode" == "status" ]; then
		status_linux
	elif [ "$mode" == "help" ]; then
		usage $0
	else
		usage $0 "Invalid argument '$mode'"
	fi
else
	if [ "$mode" == "config" ]; then
		configure_freebsd
	elif [ "$mode" == "reset" ]; then
		reset_freebsd
	elif [ "$mode" == "cleanup" ]; then
		echo "setup.sh cleanup function not yet supported on $os"
	elif [ "$mode" == "status" ]; then
		status_freebsd
	elif [ "$mode" == "help" ]; then
		usage $0
	else
		usage $0 "Invalid argument '$mode'"
	fi
fi

if [[ -e /proc/$sync_pid/status ]]; then
	wait "$sync_pid"
fi