xref: /spdk/scripts/setup.sh (revision 6b79f76769c83dacf162ff2ca2cf1cf133896835)
1#!/usr/bin/env bash
2#  SPDX-License-Identifier: BSD-3-Clause
3#  Copyright (C) 2016 Intel Corporation
4#  All rights reserved.
5#
# Abort on the first unhandled command failure.
set -e
# nullglob: unmatched globs expand to nothing instead of themselves.
# extglob: enables the +(...) / @(...) patterns used by cleanup_linux().
shopt -s nullglob extglob

os=$(uname -s)

# Only Linux and FreeBSD are handled by the mode dispatch at the bottom.
if [[ $os != Linux && $os != FreeBSD ]]; then
	echo "Not supported platform ($os), aborting"
	exit 1
fi

# Resolve the repository root relative to this script. Every expansion is
# quoted so that a checkout path containing whitespace is not word-split.
rootdir=$(readlink -f "$(dirname "$0")")/..
source "$rootdir/scripts/common.sh"
18
# Print the help text of this script to stdout and exit with status 0.
#   $1 - path of the invoked script; only its basename is printed
#   $2 - optional message (e.g. an error) printed before the help text
# Reads the global $os to pick the list of modes valid on this platform.
function usage() {
	local options

	if [[ $os == Linux ]]; then
		options="[config|reset|status|cleanup|help]"
	else
		options="[config|reset|help]"
	fi

	if [[ -n $2 ]]; then
		echo "$2"
		echo ""
	fi
	echo "Helper script for allocating hugepages and binding NVMe, I/OAT, VMD and Virtio devices"
	echo "to a generic VFIO kernel driver. If VFIO is not available on the system, this script"
	echo "will fall back to UIO. NVMe and Virtio devices with active mountpoints will be ignored."
	echo "All hugepage operations use default hugepage size on the system (hugepagesz)."
	# Quoted so that a script path containing whitespace is handled as one name.
	echo "Usage: $(basename "$1") $options"
	echo
	echo "$options - as following:"
	echo "config            Default mode. Allocate hugepages and bind PCI devices."
	if [[ $os == Linux ]]; then
		echo "cleanup           Remove any orphaned files that can be left in the system after SPDK application exit"
	fi
	echo "reset             Rebind PCI devices back to their original drivers."
	echo "                  Also cleanup any leftover spdk files/resources."
	echo "                  Hugepage memory size will remain unchanged."
	if [[ $os == Linux ]]; then
		echo "status            Print status of all SPDK-compatible devices on the system."
	fi
	echo "help              Print this help message."
	echo
	echo "The following environment variables can be specified."
	echo "HUGEMEM           Size of hugepage memory to allocate (in MB). 2048 by default."
	echo "                  For NUMA systems, the hugepages will be distributed on node0 by"
	echo "                  default."
	echo "HUGE_EVEN_ALLOC   If set to 'yes', hugepages will be evenly distributed across all"
	echo "                  system's NUMA nodes (effectively ignoring anything set in HUGENODE)."
	echo "                  Uses kernel's default for hugepages size."
	echo "NRHUGE            Number of hugepages to allocate. This variable overwrites HUGEMEM."
	echo "HUGENODE          Specific NUMA node to allocate hugepages on. Multiple nodes can be"
	echo "                  separated with commas. By default, NRHUGE will be applied on each node."
	echo "                  Hugepages can be defined per node with e.g.:"
	echo "                  HUGENODE='nodes_hp[0]=2048,nodes_hp[1]=512,2' - this will allocate"
	echo "                  2048 pages for node0, 512 for node1 and default NRHUGE for node2."
	echo "HUGEPGSZ          Size of the hugepages to use in kB. If not set, kernel's default"
	echo "                  setting is used."
	echo "SHRINK_HUGE       If set to 'yes', hugepages allocation won't be skipped in case"
	echo "                  number of requested hugepages is lower from what's already"
	echo "                  allocated. Doesn't apply when HUGE_EVEN_ALLOC is in use."
	echo "CLEAR_HUGE        If set to 'yes', the attempt to remove hugepages from all nodes will"
	echo "                  be made prior to allocation."
	echo "PCI_ALLOWED"
	echo "PCI_BLOCKED       Whitespace separated list of PCI devices (NVMe, I/OAT, VMD, Virtio)."
	echo "                  Each device must be specified as a full PCI address."
	echo "                  E.g. PCI_ALLOWED=\"0000:01:00.0 0000:02:00.0\""
	echo "                  To block all PCI devices: PCI_ALLOWED=\"none\""
	echo "                  To allow all PCI devices except 0000:01:00.0: PCI_BLOCKED=\"0000:01:00.0\""
	echo "                  To allow only PCI device 0000:01:00.0: PCI_ALLOWED=\"0000:01:00.0\""
	echo "                  If PCI_ALLOWED and PCI_BLOCKED are empty or unset, all PCI devices"
	echo "                  will be bound."
	echo "                  Each device in PCI_BLOCKED will be ignored (driver won't be changed)."
	echo "                  PCI_BLOCKED has precedence over PCI_ALLOWED."
	echo "TARGET_USER       User that will own hugepage mountpoint directory and vfio groups."
	echo "                  By default the current user will be used."
	echo "DRIVER_OVERRIDE   Disable automatic vfio-pci/uio_pci_generic selection and forcefully"
	echo "                  bind devices to the given driver."
	echo "                  E.g. DRIVER_OVERRIDE=uio_pci_generic or DRIVER_OVERRIDE=/home/public/dpdk/build/kmod/igb_uio.ko"
	echo "PCI_BLOCK_SYNC_ON_RESET"
	# The script only honours the literal value 'yes' for this variable.
	echo "                  If set to 'yes', the attempt to wait for block devices associated"
	echo "                  with given PCI device will be made upon reset"
	exit 0
}
90
# Report whether the kernel driver named in $1 is available.
#
# On monolithic kernels lsmod shows nothing useful, so the lsmod lookup is
# backed by /sys/module. /sys/bus/pci/drivers is checked as well since
# neither lsmod nor /sys/module may contain the needed info (like in
# Fedora-like OS).
#
# NOTE: the return convention is inverted compared to a usual predicate,
# callers test it with "! check_for_driver <name>":
#   0 - no name given, or the driver was not found
#   1 - the module is listed by lsmod
#   2 - the driver is visible under /sys/module or /sys/bus/pci/drivers
function check_for_driver() {
	if [[ -z $1 ]]; then
		return 0
	fi

	# Loaded modules are always listed with underscores.
	local module=${1//-/_}

	# Compare against the module-name column only. A plain substring grep
	# over the whole lsmod output would match e.g. "nvme_core" (or any
	# "Used by" entry) when asked about "nvme".
	if lsmod | awk -v mod="$module" '$1 == mod { found = 1 } END { exit !found }'; then
		return 1
	fi

	if [[ -d /sys/module/${1} || -d \
		/sys/module/$module || -d \
		/sys/bus/pci/drivers/${1} || -d \
		/sys/bus/pci/drivers/$module ]]; then
		return 2
	fi
	return 0
}
112
# Succeeds only when both DPDK kernel modules (contigmem.ko and nic_uio.ko)
# can be found as regular files in one of the directories reported by
# "kldconfig -rU" (a ';' separated list).
function check_for_driver_freebsd() {
	local module_dirs dir module found

	IFS=";" read -ra module_dirs < <(kldconfig -rU)

	for module in contigmem.ko nic_uio.ko; do
		found=0
		for dir in "${module_dirs[@]}"; do
			if [[ -f $dir/$module ]]; then
				found=1
				break
			fi
		done
		# A single missing module is enough to give up.
		if ((found == 0)); then
			return 1
		fi
	done
	return 0
}
126
# Print a message prefixed with a PCI address and its vendor/device ids.
#   $1   - PCI address (BDF), used as the key into pci_ids_vendor/pci_ids_device
#   $2.. - message words, joined into a single line
# The leading "0x" of both ids is dropped.
function pci_dev_echo() {
	local addr="$1"
	local vendor_id device_id

	shift
	vendor_id=${pci_ids_vendor["$addr"]#0x}
	device_id=${pci_ids_device["$addr"]#0x}

	echo "$addr ($vendor_id $device_id): $*"
}
132
# Bind a PCI device to the given driver through sysfs.
#   $1 - PCI address (BDF) of the device
#   $2 - target driver name; "none" only unbinds the current driver
# Reads the current driver from the global drivers_d map. Returns 1 when the
# device does not show up under the target driver after probing.
#
# All working variables are local: callers keep their own "bdf" loop variable
# and the global "driver_name" (read later by configure_linux()) must not be
# overwritten by a call made with a different driver.
function linux_bind_driver() {
	local bdf="$1"
	local driver_name="$2"
	local old_driver_name=${drivers_d["$bdf"]:-no driver}
	local probe_attempts=0
	local iommu_group

	if [[ $driver_name == "$old_driver_name" ]]; then
		pci_dev_echo "$bdf" "Already using the $old_driver_name driver"
		return 0
	fi

	if [[ $old_driver_name != "no driver" ]]; then
		echo "$bdf" > "/sys/bus/pci/devices/$bdf/driver/unbind"
	fi

	pci_dev_echo "$bdf" "$old_driver_name -> $driver_name"

	if [[ $driver_name == "none" ]]; then
		return 0
	fi

	# Pin the device to the requested driver, then ask the kernel to probe
	# it. Probing is retried a limited number of times.
	echo "$driver_name" > "/sys/bus/pci/devices/$bdf/driver_override"
	while ! echo "$bdf" > "/sys/bus/pci/drivers_probe" && ((probe_attempts++ < 10)); do
		pci_dev_echo "$bdf" "failed to bind to $driver_name, retrying ($probe_attempts)"
		sleep 0.5
	done 2> /dev/null

	# Drop the override again regardless of the probing result.
	echo "" > "/sys/bus/pci/devices/$bdf/driver_override"

	if [[ ! -e /sys/bus/pci/drivers/$driver_name/$bdf ]]; then
		pci_dev_echo "$bdf" "failed to bind to $driver_name, aborting"
		return 1
	fi

	# Hand the vfio group node over to TARGET_USER, if there is one.
	iommu_group=$(readlink -f "/sys/bus/pci/devices/$bdf/iommu_group")
	iommu_group=${iommu_group##*/}
	if [ -e "/dev/vfio/$iommu_group" ]; then
		if [ -n "$TARGET_USER" ]; then
			chown "$TARGET_USER" "/dev/vfio/$iommu_group"
		fi
	fi
}
174
# Detach the PCI device $1 from the driver recorded for it in drivers_d.
# Devices without a recorded driver are only reported. The sysfs writes are
# skipped when the recorded driver has no directory under
# /sys/bus/pci/drivers.
function linux_unbind_driver() {
	local addr="$1"
	local current=${drivers_d["$addr"]:-no driver}
	local driver_dir

	case "$current" in
		"no driver")
			pci_dev_echo "$addr" "Not bound to any driver"
			return 0
			;;
	esac

	driver_dir=/sys/bus/pci/drivers/$current
	if [[ -e $driver_dir ]]; then
		echo "$addr" > "$driver_dir/unbind"
		# Clear any override left over from a previous bind.
		echo "" > "/sys/bus/pci/devices/$addr/driver_override"
	fi

	pci_dev_echo "$addr" "$current -> no driver"
}
191
# Print the mountpoint (third field of the "mount" output) of every mounted
# hugetlbfs filesystem, one per line.
function linux_hugetlbfs_mounts() {
	mount | awk '/ type hugetlbfs / { print $3 }'
}
195
# Print, one per line, the names of the block devices under /sys/block that
# belong to the PCI device $1.
# nvme block devices are matched through the "address" attribute of their
# controller in /sys/class/nvme; every other device is matched by looking
# for "/<bdf>/" in the resolved path of its "device" link.
function get_block_dev_from_bdf() {
	local bdf=$1
	local dev_path dev_name nvme_ctrl found=()

	for dev_path in /sys/block/*; do
		dev_name=${dev_path##*/}
		if [[ $dev_path == *nvme* ]]; then
			# Strip the namespace suffix to get the controller name.
			nvme_ctrl=${dev_name%n*}
			[[ -e /sys/class/nvme/$nvme_ctrl ]] || continue
			[[ $(< "/sys/class/nvme/$nvme_ctrl/address") == "$bdf" ]] || continue
		elif [[ $(readlink -f "$dev_path/device") != *"/$bdf/"* ]]; then
			continue
		fi
		found+=("$dev_name")
	done
	printf '%s\n' "${found[@]}"
}
212
# Print, one per line, the reasons why block devices of the PCI device $1
# must be considered busy:
#   holder@<dev>:<holder> - the device (or one of its partitions) is held by
#                           another block device
#   mount@<block>:<dev>   - lsblk reports an existing mountpoint
#   data@<block>          - block_in_use() found data on the device; this is
#                           only checked while nothing else has been reported
# Returns 1 when lsblk is not available.
function get_used_bdf_block_devs() {
	local bdf=$1
	local candidates blk owner part mnt holder_path
	local in_use

	if ! hash lsblk &> /dev/null; then
		return 1
	fi
	candidates=($(get_block_dev_from_bdf "$bdf"))

	for blk in "${candidates[@]}"; do
		# Holders are reported regardless of whether anything is mounted.
		for holder_path in "/sys/class/block/$blk"*/holders/*; do
			if [[ ! -e $holder_path ]]; then
				continue
			fi
			owner=${holder_path%/holders*}
			owner=${owner##*/}
			if [[ -e $holder_path/slaves/$owner ]]; then
				in_use+=("holder@$owner:${holder_path##*/}")
			fi
		done

		while read -r part mnt; do
			if [[ -e $mnt ]]; then
				in_use+=("mount@$blk:$part")
			fi
		done < <(lsblk -l -n -o NAME,MOUNTPOINT "/dev/$blk")

		# Look for valid data on the target device even if it is not actively
		# used, so that more complex setups (ZFS pools, etc.) are not missed.
		if ((${#in_use[@]} == 0)) && block_in_use "$blk" > /dev/null; then
			in_use+=("data@$blk")
		fi
	done

	if ((${#in_use[@]} > 0)); then
		printf '%s\n' "${in_use[@]}"
	fi
}
250
# Build the global device maps for all supported device types
# (NVMe, IOAT, DSA, IAA, VIRTIO, VMD).
#   $1 - mode of the script; for "status" every device is recorded as usable
#        and none of the filtering below is applied
# Fills the global associative arrays, all keyed by PCI address:
#   <type>_d, all_devices_d - 0 when the device may be touched, 1 when it
#                             must be skipped
#   drivers_d               - name of the driver currently bound, if any
# The ids are taken from the matching entries of include/spdk/pci_ids.h and
# looked up in pci_bus_cache.
function collect_devices() {
	local ids dev_type dev_id bdf bdfs in_use driver

	ids+="PCI_DEVICE_ID_INTEL_IOAT"
	ids+="|PCI_DEVICE_ID_INTEL_DSA"
	ids+="|PCI_DEVICE_ID_INTEL_IAA"
	ids+="|PCI_DEVICE_ID_VIRTIO"
	ids+="|PCI_DEVICE_ID_INTEL_VMD"
	ids+="|SPDK_PCI_CLASS_NVME"

	local -gA nvme_d ioat_d dsa_d iaa_d virtio_d vmd_d all_devices_d drivers_d

	while read -r _ dev_type dev_id; do
		# Default lookup is by Intel vendor id; NVMe entries are keyed by the
		# id alone and virtio entries use the 0x1af4 vendor id.
		bdfs=(${pci_bus_cache["0x8086:$dev_id"]})
		[[ $dev_type == *NVME* ]] && bdfs=(${pci_bus_cache["$dev_id"]})
		[[ $dev_type == *VIRT* ]] && bdfs=(${pci_bus_cache["0x1af4:$dev_id"]})
		# Reduce the macro name to the lower-case prefix of the <type>_d maps.
		[[ $dev_type =~ (NVME|IOAT|DSA|IAA|VIRTIO|VMD) ]] && dev_type=${BASH_REMATCH[1],,}
		for bdf in "${bdfs[@]}"; do
			in_use=0
			if [[ $1 != status ]]; then
				if ! pci_can_use "$bdf"; then
					pci_dev_echo "$bdf" "Skipping denied controller at $bdf"
					in_use=1
				fi
				if [[ $dev_type == nvme || $dev_type == virtio ]]; then
					if ! verify_bdf_block_devs "$bdf"; then
						in_use=1
					fi
				fi
				if [[ $dev_type == vmd ]]; then
					# VMD controllers are opt-in: they must be named in PCI_ALLOWED.
					if [[ $PCI_ALLOWED != *"$bdf"* ]]; then
						pci_dev_echo "$bdf" "Skipping not allowed VMD controller at $bdf"
						in_use=1
					elif [[ " ${drivers_d[*]} " =~ "nvme" ]]; then
						if [[ "${DRIVER_OVERRIDE}" != "none" ]]; then
							if [ "$mode" == "config" ]; then
								cat <<- MESSAGE
									Binding new driver to VMD device. If there are NVMe SSDs behind the VMD endpoint
									which are attached to the kernel NVMe driver, the binding process may go faster
									if you first run this script with DRIVER_OVERRIDE="none" to unbind only the
									NVMe SSDs, and then run again to unbind the VMD devices.
								MESSAGE
							fi
						fi
					fi
				fi
			fi
			# The name of the target map depends on the device type, hence
			# the eval; both parts come from pci_ids.h / pci_bus_cache.
			eval "${dev_type}_d[$bdf]=$in_use"
			all_devices_d["$bdf"]=$in_use
			if [[ -e /sys/bus/pci/devices/$bdf/driver ]]; then
				driver=$(readlink -f "/sys/bus/pci/devices/$bdf/driver")
				drivers_d["$bdf"]=${driver##*/}
			fi
		done
	done < <(grep -E "$ids" "$rootdir/include/spdk/pci_ids.h")
}
309
# Print the name of the kernel driver the PCI device $1 should be handed
# back to.
# The name is resolved from the device's modalias via "modprobe -R" when
# possible. Otherwise it is derived from the <type>_d map the device is
# recorded in; if a device is present in several maps, the one checked
# first below wins. Prints an empty line when nothing matches.
function collect_driver() {
	local bdf=$1
	local modalias_file=/sys/bus/pci/devices/$bdf/modalias
	local modules picked

	if [[ -e $modalias_file ]] \
		&& modules=($(modprobe -R "$(< "$modalias_file")")); then
		# Pick first entry in case multiple aliases are bound to a driver.
		picked=$(readlink -f "/sys/module/${modules[0]}/drivers/pci:"*)
		picked=${picked##*/}
	elif [[ -n ${vmd_d["$bdf"]} ]]; then
		picked=vmd
	elif [[ -n ${virtio_d["$bdf"]} ]]; then
		picked=virtio-pci
	elif [[ -n ${iaa_d["$bdf"]} ]]; then
		picked=iaa
	elif [[ -n ${dsa_d["$bdf"]} ]]; then
		picked=dsa
	elif [[ -n ${ioat_d["$bdf"]} ]]; then
		picked=ioatdma
	elif [[ -n ${nvme_d["$bdf"]} ]]; then
		picked=nvme
	fi 2> /dev/null
	echo "$picked"
}
329
# Succeeds when no block device of the PCI device $1 is reported as used.
# Returns 1 (after printing the comma separated list of users) when some
# are, or when get_used_bdf_block_devs itself fails.
function verify_bdf_block_devs() {
	local bdf=$1
	local users

	if ! users=($(get_used_bdf_block_devs "$bdf")); then
		return 1
	fi

	if ((${#users[@]} == 0)); then
		return 0
	fi

	# "${users[*]}" joins on the first character of IFS.
	local IFS=","
	pci_dev_echo "$bdf" "Active devices: ${users[*]}, so not binding PCI dev"
	return 1
}
341
# Select the userspace I/O driver, load it and bind every usable device
# (all_devices_d[bdf] == 0) to it.
# Driver selection, in order:
#   1. DRIVER_OVERRIDE ("none", a module name or a path to a .ko file)
#   2. vfio-pci, when IOMMU groups exist or vfio's unsafe no-IOMMU mode is on
#   3. uio_pci_generic
#   4. igb_uio built in the dpdk submodule
# Sets the globals driver_name (read afterwards by configure_linux()) and
# igb_uio_fallback. Returns 1 when no driver could be selected.
function configure_linux_pci() {
	local driver_path=""
	local bdf
	local igb_uio_ko="$rootdir/dpdk/build-tmp/kernel/linux/igb_uio/igb_uio.ko"
	driver_name=""
	igb_uio_fallback=""

	if [[ -r $igb_uio_ko ]]; then
		# igb_uio is a common driver to override with and it depends on uio.
		modprobe uio || true
		# check_for_driver returns non-zero when the driver IS present.
		if ! check_for_driver igb_uio || insmod "$igb_uio_ko"; then
			igb_uio_fallback="$igb_uio_ko"
		fi
	fi

	if [[ "${DRIVER_OVERRIDE}" == "none" ]]; then
		driver_name=none
	elif [[ -n "${DRIVER_OVERRIDE}" ]]; then
		driver_path="$DRIVER_OVERRIDE"
		driver_name="${DRIVER_OVERRIDE##*/}"
		# modprobe and the sysfs don't use the .ko suffix.
		driver_name=${driver_name%.ko}
		# path = name -> there is no path
		if [[ "$driver_path" = "$driver_name" ]]; then
			driver_path=""
		fi
	elif [[ -n "$(ls /sys/kernel/iommu_groups)" || (-e \
	/sys/module/vfio/parameters/enable_unsafe_noiommu_mode && \
	"$(cat /sys/module/vfio/parameters/enable_unsafe_noiommu_mode)" == "Y") ]]; then
		driver_name=vfio-pci
		# Just in case, attempt to load VFIO_IOMMU_TYPE1 module into the kernel - this
		# should be done automatically by modprobe since this particular module should
		# be a part of vfio-pci dependencies, however, on some distros, it seems that
		# it's not the case. See #1689.
		if modinfo vfio_iommu_type1 > /dev/null 2>&1; then
			modprobe vfio_iommu_type1
		fi
	elif ! check_for_driver uio_pci_generic || modinfo uio_pci_generic > /dev/null 2>&1; then
		driver_name=uio_pci_generic
	elif [[ -e $igb_uio_fallback ]]; then
		driver_path="$igb_uio_fallback"
		driver_name="igb_uio"
		echo "WARNING: uio_pci_generic not detected - using $driver_name"
	else
		echo "No valid drivers found [vfio-pci, uio_pci_generic, igb_uio]. Please enable one of the kernel modules."
		return 1
	fi

	# modprobe assumes the directory of the module. If the user passes in a path, we should use insmod.
	# Both arguments are quoted so that a path containing whitespace stays a single argument.
	if [[ $driver_name != "none" ]]; then
		if [[ -n "$driver_path" ]]; then
			insmod "$driver_path" || true
		else
			modprobe "$driver_name"
		fi
	fi

	for bdf in "${!all_devices_d[@]}"; do
		if ((all_devices_d["$bdf"] == 0)); then
			if [[ -n ${nvme_d["$bdf"]} ]]; then
				# Some nvme controllers may take significant amount of time while being
				# unbound from the driver. Put that task into background to speed up the
				# whole process. Currently this is done only for the devices bound to the
				# nvme driver as other, i.e., ioatdma's, trigger a kernel BUG when being
				# unbound in parallel. See https://bugzilla.kernel.org/show_bug.cgi?id=209041.
				linux_bind_driver "$bdf" "$driver_name" &
			else
				linux_bind_driver "$bdf" "$driver_name"
			fi
		fi
	done
	# Barrier for the background binds started above.
	wait

	echo "1" > "/sys/bus/pci/rescan"
}
415
416function cleanup_linux() {
417	local dirs_to_clean=() files_to_clean=() opened_files=() file_locks=()
418	local match_spdk="spdk_tgt|iscsi|vhost|nvmf|rocksdb|bdevio|bdevperf|vhost_fuzz|nvme_fuzz|accel_perf|bdev_svc"
419
420	dirs_to_clean=({/var/run,/tmp}/dpdk/spdk{,_pid}+([0-9]))
421	if [[ -d $XDG_RUNTIME_DIR ]]; then
422		dirs_to_clean+=("$XDG_RUNTIME_DIR/dpdk/spdk"{,_pid}+([0-9]))
423	fi
424
425	for dir in "${dirs_to_clean[@]}"; do
426		files_to_clean+=("$dir/"*)
427	done
428	file_locks+=(/var/tmp/spdk_pci_lock*)
429	file_locks+=(/var/tmp/spdk_cpu_lock*)
430
431	files_to_clean+=(/dev/shm/@(@($match_spdk)_trace|spdk_iscsi_conns)*)
432	files_to_clean+=("${file_locks[@]}")
433
434	# This may fail in case path that readlink attempts to resolve suddenly
435	# disappears (as it may happen with terminating processes).
436	opened_files+=($(readlink -f /proc/+([0-9])/fd/+([0-9]))) || true
437
438	if ((${#opened_files[@]} == 0)); then
439		echo "Can't get list of opened files!"
440		exit 1
441	fi
442
443	echo 'Cleaning'
444	for f in "${files_to_clean[@]}"; do
445		[[ -e $f ]] || continue
446		if [[ ${opened_files[*]} != *"$f"* ]]; then
447			echo "Removing:    $f"
448			rm $f
449		else
450			echo "Still open: $f"
451		fi
452	done
453
454	for dir in "${dirs_to_clean[@]}"; do
455		[[ -d $dir ]] || continue
456		if [[ ${opened_files[*]} != *"$dir"* ]]; then
457			echo "Removing:    $dir"
458			rmdir $dir
459		else
460			echo "Still open: $dir"
461		fi
462	done
463	echo "Clean"
464}
465
# Make sure NRHUGE hugepages are allocated through the given nr_hugepages file.
#   $1 - path of the nr_hugepages file to read and write
#   $2 - optional node id, used only in the messages
# Nothing is written when enough pages are already there, unless SHRINK_HUGE
# is 'yes'. Negative NRHUGE values are written as 0.
# Returns 1 when, after the write, the file reports fewer pages than NRHUGE.
check_hugepages_alloc() {
	local nr_file=$1
	local current target

	current=$(< "$nr_file")

	if ((NRHUGE <= current)); then
		if [[ $SHRINK_HUGE != yes ]]; then
			echo "INFO: Requested $NRHUGE hugepages but $current already allocated ${2:+on node$2}"
			return 0
		fi
	fi

	target=$((NRHUGE < 0 ? 0 : NRHUGE))
	echo "$target" > "$nr_file"

	# Read the value back to see how many pages are reported now.
	current=$(< "$nr_file")
	if ((current >= NRHUGE)); then
		return 0
	fi

	echo ""
	echo "## ERROR: requested $NRHUGE hugepages but $current could be allocated ${2:+on node$2}."
	echo "## Memory might be heavily fragmented. Please try flushing the system cache, or reboot the machine."
	return 1
}
489
# Request zero hugepages through the system-wide nr_hugepages knob.
clear_hugepages() {
	echo 0 > /proc/sys/vm/nr_hugepages
}
491
# Allocate hugepages according to the environment:
#   CLEAR_HUGE=yes      - clear the hugepages first
#   HUGE_EVEN_ALLOC=yes - clear, then allocate NRHUGE pages through the
#                         system-wide interface and return
#   HUGENODE            - comma separated list of entries, each either a
#                         node id (gets NRHUGE pages) or nodes_hp[<id>]=<n>
#                         (gets <n> pages, NRHUGE when <n> is empty).
#                         When empty, node 0 is used.
# Without any /sys/devices/system/node/node* entry the system-wide interface
# is used as well. Nodes that do not exist are reported and skipped.
configure_linux_hugepages() {
	local node entry
	local nodes=() nodes_to_use=() nodes_hp=()
	# Strict form of a per-node entry. HUGENODE entries are parsed with this
	# pattern instead of being passed to eval, so that nothing appended to
	# an entry can be executed as a command.
	local hp_entry_re='^nodes_hp\[([0-9]+)\]=([0-9]*)$'

	if [[ $CLEAR_HUGE == yes ]]; then
		clear_hugepages
	fi

	if [[ $HUGE_EVEN_ALLOC == yes ]]; then
		clear_hugepages
		check_hugepages_alloc /proc/sys/vm/nr_hugepages
		return 0
	fi

	# Map node id -> nr_hugepages file for the selected hugepage size.
	for node in /sys/devices/system/node/node*; do
		[[ -e $node ]] || continue
		nodes[${node##*node}]=$node/hugepages/hugepages-${HUGEPGSZ}kB/nr_hugepages
	done

	if ((${#nodes[@]} == 0)); then
		# No NUMA support? Fallback to common interface
		check_hugepages_alloc /proc/sys/vm/nr_hugepages
		return 0
	fi

	IFS="," read -ra nodes_to_use <<< "$HUGENODE"
	if ((${#nodes_to_use[@]} == 0)); then
		nodes_to_use[0]=0
	fi

	# Align indexes with node ids. 10# keeps ids with leading zeros decimal.
	for entry in "${nodes_to_use[@]}"; do
		if [[ $entry =~ $hp_entry_re ]]; then
			nodes_hp[$((10#${BASH_REMATCH[1]}))]=${BASH_REMATCH[2]}
		elif [[ $entry =~ ^[0-9]+$ ]]; then
			nodes_hp[$((10#$entry))]=$NRHUGE
		else
			echo "Ignoring invalid HUGENODE entry '$entry'" >&2
		fi
	done

	for node in "${!nodes_hp[@]}"; do
		if [[ -z ${nodes[node]} ]]; then
			echo "Node $node doesn't exist, ignoring" >&2
			continue
		fi
		# The per-node count overrides NRHUGE for this single call only.
		NRHUGE=${nodes_hp[node]:-$NRHUGE} check_hugepages_alloc "${nodes[node]}" "$node"
	done
}
539
# Linux "config" mode: bind the PCI devices, make sure a hugetlbfs mount
# exists (mounting one at /mnt/huge if there is none) and allocate the
# hugepages.
# When vfio-pci was selected and TARGET_USER is set, the hugetlbfs
# mountpoints are handed over to that user and its memlock limit is reported.
function configure_linux() {
	local mount

	configure_linux_pci
	hugetlbfs_mounts=$(linux_hugetlbfs_mounts)

	if [ -z "$hugetlbfs_mounts" ]; then
		hugetlbfs_mounts=/mnt/huge
		echo "Mounting hugetlbfs at $hugetlbfs_mounts"
		mkdir -p "$hugetlbfs_mounts"
		mount -t hugetlbfs nodev "$hugetlbfs_mounts"
	fi

	configure_linux_hugepages

	if [ "$driver_name" = "vfio-pci" ]; then
		if [ -n "$TARGET_USER" ]; then
			# Deliberately unquoted: the list holds one mountpoint per line.
			for mount in $hugetlbfs_mounts; do
				chown "$TARGET_USER" "$mount"
				chmod g+w "$mount"
			done

			MEMLOCK_AMNT=$(su "$TARGET_USER" -c "ulimit -l")
			if [[ $MEMLOCK_AMNT != "unlimited" ]]; then
				MEMLOCK_MB=$((MEMLOCK_AMNT / 1024))
				cat <<- MEMLOCK
					"$TARGET_USER" user memlock limit: $MEMLOCK_MB MB

					This is the maximum amount of memory you will be
					able to use with DPDK and VFIO if run as user "$TARGET_USER".
					To change this, please adjust limits.conf memlock limit for user "$TARGET_USER".
				MEMLOCK
				if ((MEMLOCK_AMNT < 65536)); then
					echo ""
					echo "## WARNING: memlock limit is less than 64MB"
					echo -n "## DPDK with VFIO may not be able to initialize "
					echo "if run as user \"$TARGET_USER\"."
				fi
			fi
		fi
	fi

	# "uname -m" is used for the architecture check: "uname -i" is
	# non-portable and prints "unknown" on many distributions, which would
	# silently skip loading msr there.
	if [[ $(uname -m) == "x86_64" && ! -e /dev/cpu/0/msr ]]; then
		# Some distros build msr as a module.  Make sure it's loaded to ensure
		#  DPDK can easily figure out the TSC rate rather than relying on 100ms
		#  sleeps.
		modprobe msr &> /dev/null || true
	fi
}
587
# Hand every usable device (all_devices_d[bdf] == 0) back to the driver
# reported by collect_driver, or just unbind it when that driver is unknown
# or check_for_driver does not find it. A PCI rescan is requested at the end.
function reset_linux_pci() {
	# virtio
	# TODO: check if virtio-pci is loaded first and just unbind if it is not loaded
	# Requires some more investigation - for example, some kernels do not seem to have
	#  virtio-pci but just virtio_scsi instead.  Also need to make sure we get the
	#  underscore vs. dash right in the virtio_scsi name.
	modprobe virtio-pci || true

	for bdf in "${!all_devices_d[@]}"; do
		if ((all_devices_d["$bdf"] != 0)); then
			continue
		fi

		driver=$(collect_driver "$bdf")
		# check_for_driver succeeds (returns 0) when the driver is NOT found.
		if [[ -z $driver ]] || check_for_driver "$driver"; then
			linux_unbind_driver "$bdf"
		else
			linux_bind_driver "$bdf" "$driver"
		fi
	done

	echo "1" > "/sys/bus/pci/rescan"
}
608
# Linux "reset" mode: rebind the PCI devices, then remove the spdk*map_*
# files from every hugetlbfs mount (only those on which a non-blocking flock
# succeeds) and the /run/.spdk* files.
function reset_linux() {
	reset_linux_pci

	for mount in $(linux_hugetlbfs_mounts); do
		for hp in "$mount"/spdk*map_*; do
			# A file whose lock cannot be taken is left alone.
			if flock -n "$hp" true; then
				rm -f "$hp"
			fi
		done
	done

	rm -f /run/.spdk*
}
618
# Linux "status" mode: print the hugepage counters followed by a table of
# all collected devices.
# Both table headers go to stderr while the rows go to stdout.
# Reads the global maps filled by collect_devices() plus pci_ids_vendor and
# pci_ids_device. All working variables are local so that nothing leaks into
# the rest of the script.
function status_linux() {
	local numa_nodes=0 path free_pages all_pages node huge_size
	local sorted_bdfs bdf driver name blknames desc

	echo "Hugepages" >&2
	printf "%-6s %10s %8s / %6s\n" "node" "hugesize" "free" "total" >&2

	# One row per node and hugepage size.
	for path in /sys/devices/system/node/node*/hugepages/hugepages-*/; do
		numa_nodes=$((numa_nodes + 1))
		free_pages=$(< "$path/free_hugepages")
		all_pages=$(< "$path/nr_hugepages")

		[[ $path =~ (node[0-9]+)/hugepages/hugepages-([0-9]+kB) ]]

		node=${BASH_REMATCH[1]}
		huge_size=${BASH_REMATCH[2]}

		# Arguments are quoted so an empty value cannot shift the columns.
		printf "%-6s %10s %8s / %6s\n" "$node" "$huge_size" "$free_pages" "$all_pages"
	done

	# fall back to system-wide hugepages
	if [ "$numa_nodes" = "0" ]; then
		free_pages=$(grep HugePages_Free /proc/meminfo | awk '{ print $2 }')
		all_pages=$(grep HugePages_Total /proc/meminfo | awk '{ print $2 }')
		node="-"
		huge_size="$HUGEPGSZ"

		printf "%-6s %10s %8s / %6s\n" "$node" "$huge_size" "$free_pages" "$all_pages"
	fi

	printf '\n%-8s %-15s %-6s %-6s %-7s %-16s %-10s %s\n' \
		"Type" "BDF" "Vendor" "Device" "NUMA" "Driver" "Device" "Block devices" >&2

	sorted_bdfs=($(printf '%s\n' "${!all_devices_d[@]}" | sort))

	for bdf in "${sorted_bdfs[@]}"; do
		driver=${drivers_d["$bdf"]}
		if [ "$numa_nodes" = "0" ]; then
			node="-"
		else
			node=$(cat "/sys/bus/pci/devices/$bdf/numa_node")
			if ((node == -1)); then
				node=unknown
			fi
		fi
		if [ "$driver" = "nvme" ] && [ -d "/sys/bus/pci/devices/$bdf/nvme" ]; then
			name=$(ls "/sys/bus/pci/devices/$bdf/nvme")
		else
			name="-"
		fi

		# Only NVMe and virtio devices are looked up for block devices.
		if [[ -n ${nvme_d["$bdf"]} || -n ${virtio_d["$bdf"]} ]]; then
			blknames=($(get_block_dev_from_bdf "$bdf"))
		else
			blknames=("-")
		fi

		# The first map that knows the device decides the printed type.
		desc=""
		desc=${desc:-${nvme_d["$bdf"]:+NVMe}}
		desc=${desc:-${ioat_d["$bdf"]:+I/OAT}}
		desc=${desc:-${dsa_d["$bdf"]:+DSA}}
		desc=${desc:-${iaa_d["$bdf"]:+IAA}}
		desc=${desc:-${virtio_d["$bdf"]:+virtio}}
		desc=${desc:-${vmd_d["$bdf"]:+VMD}}

		printf '%-8s %-15s %-6s %-6s %-7s %-16s %-10s %s\n' \
			"$desc" "$bdf" "${pci_ids_vendor["$bdf"]#0x}" "${pci_ids_device["$bdf"]#0x}" \
			"$node" "${driver:--}" "${name:-}" "${blknames[*]:--}"
	done
}
687
# FreeBSD "status" mode: print the state of the contigmem module and its
# kenv settings, followed by one table row per collected NVMe, I/OAT, DSA,
# IAA and VMD device. The table header goes to stderr.
function status_freebsd() {
	local pci

	# Print one row per PCI address given after the type label in $1, sorted
	# by address. The body runs in a subshell.
	status_print() (
		local type=$1
		local dev driver
		local row_fmt='%-8s %-15s %-6s %-6s %-16s\n'

		shift

		for pci in "$@"; do
			printf "$row_fmt" "$type" "$pci" "${pci_ids_vendor["$pci"]}" \
				"${pci_ids_device["$pci"]}" "${pci_bus_driver["$pci"]}"
		done | sort -k2,2
	)

	local contigmem
	local contigmem_buffer_size
	local contigmem_num_buffers

	if kldstat -q -m contigmem; then
		contigmem=present
	else
		contigmem="not present"
	fi
	# kenv fails when the given option is not present in the kernel environment.
	contigmem_buffer_size=$(kenv hw.contigmem.buffer_size 2> /dev/null) \
		|| contigmem_buffer_size="not set"
	contigmem_num_buffers=$(kenv hw.contigmem.num_buffers 2> /dev/null) \
		|| contigmem_num_buffers="not set"

	printf '%s\n' \
		"Contigmem ($contigmem)" \
		"Buffer Size: $contigmem_buffer_size" \
		"Num Buffers: $contigmem_num_buffers" \
		""

	printf '\n%-8s %-15s %-6s %-6s %-16s\n' \
		"Type" "BDF" "Vendor" "Device" "Driver" >&2

	status_print "NVMe" "${!nvme_d[@]}"
	status_print "I/OAT" "${!ioat_d[@]}"
	status_print "DSA" "${!dsa_d[@]}"
	status_print "IAA" "${!iaa_d[@]}"
	status_print "VMD" "${!vmd_d[@]}"
}
737
# Reload nic_uio with the list of collected NVMe, I/OAT, DSA, IAA and VMD
# devices: the addresses, without their domain part, are passed as a comma
# separated list through the hw.nic_uio.bdfs kenv variable.
function configure_freebsd_pci() {
	local BDFS

	BDFS=(
		"${!nvme_d[@]}"
		"${!ioat_d[@]}"
		"${!dsa_d[@]}"
		"${!iaa_d[@]}"
		"${!vmd_d[@]}"
	)

	# Drop the domain part from all the addresses
	BDFS=("${BDFS[@]#*:}")

	# "${BDFS[*]}" joins on the first character of IFS.
	local IFS=","
	# The module may simply not be loaded yet.
	kldunload nic_uio.ko || true
	kenv hw.nic_uio.bdfs="${BDFS[*]}"
	kldload nic_uio.ko
}
755
# FreeBSD "config" mode: configure nic_uio and (re)load contigmem with
# HUGEMEM / 256 buffers of 256MB each.
# Returns 1 when the DPDK kernel modules cannot be found.
function configure_freebsd() {
	if ! check_for_driver_freebsd; then
		echo "DPDK drivers (contigmem and/or nic_uio) are missing, aborting" >&2
		return 1
	fi

	configure_freebsd_pci

	# If contigmem is already loaded but the HUGEMEM specified doesn't match the
	#  previous value, unload contigmem so that we can reload with the new value.
	if kldstat -q -m contigmem; then
		# contigmem may be loaded, but the kernel environment doesn't have to
		# be necessarily set at this point. If it isn't, kenv will fail to
		# pick up the hw. options. Handle it.
		contigmem_num_buffers=$(kenv hw.contigmem.num_buffers 2> /dev/null) \
			|| contigmem_num_buffers=-1
		if ((contigmem_num_buffers != HUGEMEM / 256)); then
			kldunload contigmem.ko
		fi
	fi

	# Nothing to do when the module is (still) loaded at this point.
	if kldstat -q -m contigmem; then
		return 0
	fi
	kenv hw.contigmem.num_buffers=$((HUGEMEM / 256))
	kenv hw.contigmem.buffer_size=$((256 * 1024 * 1024))
	kldload contigmem.ko
}
781
# FreeBSD "reset" mode: unload both DPDK kernel modules. A module that
# cannot be unloaded is not treated as an error.
function reset_freebsd() {
	local module

	for module in contigmem.ko nic_uio.ko; do
		kldunload "$module" || true
	done
}
786
# cache_pci_bus is not defined in this file (presumably it comes from the
# sourced scripts/common.sh - confirm there); it is invoked with CMD=reset
# set in its environment.
CMD=reset cache_pci_bus

# First argument selects the mode; "config" is the default.
mode=${1:-config}

# Defaults for the settings that may be passed through the environment.
HUGEMEM=${HUGEMEM:-2048}
PCI_ALLOWED=${PCI_ALLOWED:-}
PCI_BLOCKED=${PCI_BLOCKED:-}

# NVME_ALLOWED extends the list of allowed devices.
if [[ -n $NVME_ALLOWED ]]; then
	PCI_ALLOWED="$PCI_ALLOWED $NVME_ALLOWED"
fi

# SKIP_PCI overrides whatever was allowed so far.
if [[ -n $SKIP_PCI ]]; then
	PCI_ALLOWED="none"
fi

# Without an explicit TARGET_USER, use the user who invoked sudo and, failing
# that, the login name. TARGET_USER may still end up empty.
if [[ -z $TARGET_USER ]]; then
	TARGET_USER=${SUDO_USER:-$(logname 2> /dev/null)} || true
fi

collect_devices "$mode"
815
# On reset with PCI_BLOCK_SYNC_ON_RESET=yes, start sync_dev_uevents.sh in the
# background for the storage controllers that are about to change driver.
# Its pid is kept in sync_pid and waited for at the very end of the script.
if [[ $mode == reset && $PCI_BLOCK_SYNC_ON_RESET == yes ]]; then
	# Note that this will wait only for the first block device attached to
	# a given storage controller. For nvme this may miss some of the devs
	# in case multiple namespaces are being in place.
	# FIXME: Wait for nvme controller(s) to be in live state and determine
	# number of configured namespaces, build list of potential block devs
	# and pass them to sync_dev_uevents. Is it worth the effort?
	bdfs_to_wait_for=()
	for bdf in "${!all_devices_d[@]}"; do
		# Devices marked as in use are not going to be touched.
		if ((all_devices_d["$bdf"] != 0)); then
			continue
		fi
		# Only NVMe and virtio controllers are considered here.
		if [[ -z ${nvme_d["$bdf"]} && -z ${virtio_d["$bdf"]} ]]; then
			continue
		fi
		# Nothing to wait for when the device already uses the driver that
		# collect_driver reports for it.
		if [[ $(collect_driver "$bdf") == "${drivers_d["$bdf"]}" ]]; then
			continue
		fi
		bdfs_to_wait_for+=("$bdf")
	done
	if ((${#bdfs_to_wait_for[@]} > 0)); then
		echo "Waiting for block devices as requested"
		export UEVENT_TIMEOUT=5 DEVPATH_LOOKUP=yes DEVPATH_SUBSYSTEM=pci
		"$rootdir/scripts/sync_dev_uevents.sh" block/disk "${bdfs_to_wait_for[@]}" &
		sync_pid=$!
	fi
fi
840
# Dispatch the requested mode to the platform specific implementation.
if [[ $os == Linux ]]; then
	# A HUGEPGSZ the running kernel does not know is dropped in favour of
	# the default hugepage size taken from /proc/meminfo.
	if [[ -n $HUGEPGSZ && ! -e /sys/kernel/mm/hugepages/hugepages-${HUGEPGSZ}kB ]]; then
		echo "${HUGEPGSZ}kB is not supported by the running kernel, ignoring" >&2
		unset -v HUGEPGSZ
	fi

	HUGEPGSZ=${HUGEPGSZ:-$(grep Hugepagesize /proc/meminfo | cut -d : -f 2 | tr -dc '0-9')}
	HUGEPGSZ_MB=$((HUGEPGSZ / 1024))
	# Default NRHUGE: HUGEMEM (MB) divided by the hugepage size, rounded up.
	# The division is done in kB because HUGEPGSZ_MB is 0 for hugepage sizes
	# below 1024kB, which would make a division in MB fail with a division
	# by zero. For sizes that are whole MBs the result is the same.
	: "${NRHUGE=$(((HUGEMEM * 1024 + HUGEPGSZ - 1) / HUGEPGSZ))}"

	case "$mode" in
		config)
			configure_linux
			;;
		cleanup)
			cleanup_linux
			clear_hugepages
			;;
		reset)
			reset_linux
			;;
		status)
			status_linux
			;;
		help)
			usage "$0"
			;;
		*)
			usage "$0" "Invalid argument '$mode'"
			;;
	esac
else
	case "$mode" in
		config)
			configure_freebsd
			;;
		reset)
			reset_freebsd
			;;
		cleanup)
			echo "setup.sh cleanup function not yet supported on $os"
			;;
		status)
			status_freebsd
			;;
		help)
			usage "$0"
			;;
		*)
			usage "$0" "Invalid argument '$mode'"
			;;
	esac
fi

# Wait for the background uevent sync, if one was started and is still
# running.
if [[ -n $sync_pid && -e /proc/$sync_pid/status ]]; then
	wait "$sync_pid"
fi
884