#!/usr/bin/env bash

set -e
shopt -s nullglob extglob

os=$(uname -s)

if [[ $os != Linux && $os != FreeBSD ]]; then
	echo "Unsupported platform ($os), aborting"
	exit 1
fi

rootdir=$(readlink -f $(dirname $0))/..
source "$rootdir/scripts/common.sh"

function usage() {
	if [[ $os == Linux ]]; then
		options="[config|reset|status|cleanup|help]"
	else
		options="[config|reset|help]"
	fi

	[[ -n $2 ]] && (
		echo "$2"
		echo ""
	)
	echo "Helper script for allocating hugepages and binding NVMe, I/OAT, VMD and Virtio devices"
	echo "to a generic VFIO kernel driver. If VFIO is not available on the system, this script"
	echo "will fall back to UIO. NVMe and Virtio devices with active mountpoints will be ignored."
	echo "All hugepage operations use the system's default hugepage size (hugepagesz)."
	echo "Usage: $(basename $1) $options"
	echo
	echo "$options - as follows:"
	echo "config            Default mode. Allocate hugepages and bind PCI devices."
	if [[ $os == Linux ]]; then
		echo "cleanup           Remove any orphaned files that may be left in the system after SPDK application exit"
	fi
	echo "reset             Rebind PCI devices back to their original drivers."
	echo "                  Also clean up any leftover spdk files/resources."
	echo "                  Hugepage memory size will remain unchanged."
	if [[ $os == Linux ]]; then
		echo "status            Print status of all SPDK-compatible devices on the system."
	fi
	echo "help              Print this help message."
	echo
	echo "The following environment variables can be specified."
	echo "HUGEMEM           Size of hugepage memory to allocate (in MB). 2048 by default."
	echo "                  On NUMA systems, the hugepages will be allocated on node0 by"
	echo "                  default."
	echo "HUGE_EVEN_ALLOC   If set to 'yes', hugepages will be evenly distributed across all"
	echo "                  of the system's NUMA nodes (effectively ignoring anything set in HUGENODE)."
	echo "                  Uses the kernel's default hugepage size."
	echo "NRHUGE            Number of hugepages to allocate. This variable overrides HUGEMEM."
	echo "HUGENODE          Specific NUMA node to allocate hugepages on. Multiple nodes can be"
	echo "                  separated with commas. By default, NRHUGE will be applied on each node."
	echo "                  Hugepages can be defined per node, e.g.:"
	echo "                  HUGENODE='nodes_hp[0]=2048,nodes_hp[1]=512,2' - this will allocate"
	echo "                  2048 pages for node0, 512 for node1 and the default NRHUGE for node2."
	echo "HUGEPGSZ          Size of the hugepages to use, in kB. If not set, the kernel's default"
	echo "                  setting is used."
	echo "SHRINK_HUGE       If set to 'yes', hugepage allocation won't be skipped when the"
	echo "                  number of requested hugepages is lower than what's already"
	echo "                  allocated. Doesn't apply when HUGE_EVEN_ALLOC is in use."
	echo "CLEAR_HUGE        If set to 'yes', an attempt to remove hugepages from all nodes will"
	echo "                  be made prior to allocation."
	echo "PCI_ALLOWED"
	echo "PCI_BLOCKED       Whitespace-separated list of PCI devices (NVMe, I/OAT, VMD, Virtio)."
	echo "                  Each device must be specified as a full PCI address."
	echo "                  E.g. PCI_ALLOWED=\"0000:01:00.0 0000:02:00.0\""
	echo "                  To block all PCI devices: PCI_ALLOWED=\"none\""
	echo "                  To allow all PCI devices except 0000:01:00.0: PCI_BLOCKED=\"0000:01:00.0\""
	echo "                  To allow only PCI device 0000:01:00.0: PCI_ALLOWED=\"0000:01:00.0\""
	echo "                  If PCI_ALLOWED and PCI_BLOCKED are empty or unset, all PCI devices"
	echo "                  will be bound."
	echo "                  Each device in PCI_BLOCKED will be ignored (its driver won't be changed)."
	echo "                  PCI_BLOCKED has precedence over PCI_ALLOWED."
	echo "TARGET_USER       User that will own the hugepage mountpoint directory and vfio groups."
	echo "                  By default the current user will be used."
	echo "DRIVER_OVERRIDE   Disable automatic vfio-pci/uio_pci_generic selection and forcefully"
	echo "                  bind devices to the given driver."
	echo "                  E.g. DRIVER_OVERRIDE=uio_pci_generic or DRIVER_OVERRIDE=/home/public/dpdk/build/kmod/igb_uio.ko"
	echo "PCI_BLOCK_SYNC_ON_RESET"
	echo "                  If set in the environment, an attempt to wait for block devices associated"
	echo "                  with a given PCI device will be made upon reset."
	exit 0
}

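# A few illustrative invocations (addresses and sizes below are only examples):
#   sudo ./scripts/setup.sh                                   # config mode: hugepages + bind devices
#   sudo HUGEMEM=4096 ./scripts/setup.sh                      # allocate 4096 MB of hugepages instead of 2048
#   sudo PCI_BLOCKED="0000:01:00.0" ./scripts/setup.sh        # bind everything except 0000:01:00.0
#   sudo DRIVER_OVERRIDE=uio_pci_generic ./scripts/setup.sh   # force a specific driver
#   sudo ./scripts/setup.sh reset                             # rebind devices to their original drivers
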
# In monolithic kernels lsmod won't work, so fall back to checking
# /sys/module. We also check /sys/bus/pci/drivers/ as neither lsmod
# nor /sys/module may contain the needed info (like on Fedora-like
# OSes).
function check_for_driver() {
	if [[ -z $1 ]]; then
		return 0
	fi

	if lsmod | grep -q ${1//-/_}; then
		return 1
	fi

	if [[ -d /sys/module/${1} || -d \
		/sys/module/${1//-/_} || -d \
		/sys/bus/pci/drivers/${1} || -d \
		/sys/bus/pci/drivers/${1//-/_} ]]; then
		return 2
	fi
	return 0
}

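# Note the inverted semantics: check_for_driver returns 0 (success) when the
# driver is *absent* and non-zero when present, so callers test for presence
# with negation, e.g.:
#   if ! check_for_driver igb_uio; then
#   	: # igb_uio is loaded or otherwise known to the system
#   fi
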
function check_for_driver_freebsd() {
	# Check if dpdk drivers (nic_uio, contigmem) are in the kernel's module path.
	local search_paths path driver
	IFS=";" read -ra search_paths < <(kldconfig -rU)

	for driver in contigmem.ko nic_uio.ko; do
		for path in "${search_paths[@]}"; do
			[[ -f $path/$driver ]] && continue 2
		done
		return 1
	done
	return 0
}

function pci_dev_echo() {
	local bdf="$1"
	shift
	echo "$bdf (${pci_ids_vendor["$bdf"]#0x} ${pci_ids_device["$bdf"]#0x}): $*"
}

function linux_bind_driver() {
	bdf="$1"
	driver_name="$2"
	old_driver_name=${drivers_d["$bdf"]:-no driver}
	ven_dev_id="${pci_ids_vendor["$bdf"]#0x} ${pci_ids_device["$bdf"]#0x}"

	if [[ $driver_name == "$old_driver_name" ]]; then
		pci_dev_echo "$bdf" "Already using the $old_driver_name driver"
		return 0
	fi

	if [[ $old_driver_name != "no driver" ]]; then
		echo "$ven_dev_id" > "/sys/bus/pci/devices/$bdf/driver/remove_id" 2> /dev/null || true
		echo "$bdf" > "/sys/bus/pci/devices/$bdf/driver/unbind"
	fi

	pci_dev_echo "$bdf" "$old_driver_name -> $driver_name"

	if [[ $driver_name == "none" ]]; then
		return 0
	fi

	echo "$ven_dev_id" > "/sys/bus/pci/drivers/$driver_name/new_id" 2> /dev/null || true
	echo "$bdf" > "/sys/bus/pci/drivers/$driver_name/bind" 2> /dev/null || true

	if [[ $driver_name == uio_pci_generic ]] && ! check_for_driver igb_uio; then
		# Check if the uio_pci_generic driver is broken, as it might be in
		# some 4.18.x kernels (see centos8 for instance) - if our device
		# didn't get a proper uio entry, fall back to igb_uio.
		if [[ ! -e /sys/bus/pci/devices/$bdf/uio ]]; then
			pci_dev_echo "$bdf" "uio_pci_generic potentially broken, moving to igb_uio"
			drivers_d["$bdf"]="no driver"
			# This call will override $driver_name for the remaining devices as well
			linux_bind_driver "$bdf" igb_uio
		fi
	fi

	iommu_group=$(basename $(readlink -f /sys/bus/pci/devices/$bdf/iommu_group))
	if [ -e "/dev/vfio/$iommu_group" ]; then
		if [ -n "$TARGET_USER" ]; then
			chown "$TARGET_USER" "/dev/vfio/$iommu_group"
		fi
	fi
}

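# For reference, the bind above boils down to the usual sysfs sequence; with
# purely illustrative vendor/device IDs and BDF it would look like:
#   echo "8086 0953" > /sys/bus/pci/drivers/vfio-pci/new_id
#   echo 0000:01:00.0 > /sys/bus/pci/drivers/vfio-pci/bind
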
function linux_unbind_driver() {
	local bdf="$1"
	local ven_dev_id
	ven_dev_id="${pci_ids_vendor["$bdf"]#0x} ${pci_ids_device["$bdf"]#0x}"
	local old_driver_name=${drivers_d["$bdf"]:-no driver}

	if [[ $old_driver_name == "no driver" ]]; then
		pci_dev_echo "$bdf" "Not bound to any driver"
		return 0
	fi

	if [[ -e /sys/bus/pci/drivers/$old_driver_name ]]; then
		echo "$ven_dev_id" > "/sys/bus/pci/drivers/$old_driver_name/remove_id" 2> /dev/null || true
		echo "$bdf" > "/sys/bus/pci/drivers/$old_driver_name/unbind"
	fi

	pci_dev_echo "$bdf" "$old_driver_name -> no driver"
}

function linux_hugetlbfs_mounts() {
	mount | grep ' type hugetlbfs ' | awk '{ print $3 }'
}

function get_block_dev_from_bdf() {
	local bdf=$1
	local block

	for block in /sys/block/*; do
		if [[ $(readlink -f "$block/device") == *"/$bdf/"* ]]; then
			echo "${block##*/}"
		fi
	done
}

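# e.g. for a hypothetical NVMe controller at 0000:01:00.0 with one namespace,
# get_block_dev_from_bdf would print "nvme0n1", since
# /sys/block/nvme0n1/device resolves to a path containing "/0000:01:00.0/".
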
function get_used_bdf_block_devs() {
	local bdf=$1
	local blocks block blockp dev mount holder
	local used

	hash lsblk &> /dev/null || return 1
	blocks=($(get_block_dev_from_bdf "$bdf"))

	for block in "${blocks[@]}"; do
		# Check if the device is held by some other device, regardless of
		# whether it's mounted or not.
		for holder in "/sys/class/block/$block"*/holders/*; do
			[[ -e $holder ]] || continue
			blockp=${holder%/holders*} blockp=${blockp##*/}
			if [[ -e $holder/slaves/$blockp ]]; then
				used+=("holder@$blockp:${holder##*/}")
			fi
		done
		while read -r dev mount; do
			if [[ -e $mount ]]; then
				used+=("mount@$block:$dev")
			fi
		done < <(lsblk -l -n -o NAME,MOUNTPOINT "/dev/$block")
		if ((${#used[@]} == 0)); then
			# Make sure we check if there's any valid data present on the target
			# device, regardless of whether it's being actively used or not. This
			# is mainly done to make sure we don't miss more complex setups like
			# ZFS pools, etc.
			if block_in_use "$block" > /dev/null; then
				used+=("data@$block")
			fi
		fi
	done

	if ((${#used[@]} > 0)); then
		printf '%s\n' "${used[@]}"
	fi
}

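# The entries printed above are tags of the following forms (device names
# are illustrative):
#   holder@nvme0n1p1:dm-0   - the partition is claimed by another device (e.g. device-mapper)
#   mount@nvme0n1:nvme0n1p1 - a (partition of the) device is mounted
#   data@nvme0n1            - no active users, but a data signature (e.g. ZFS) was found
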
function collect_devices() {
	# NVMe, IOAT, DSA, IAA, VIRTIO, VMD

	local ids dev_type dev_id bdf bdfs in_use driver

	ids+="PCI_DEVICE_ID_INTEL_IOAT"
	ids+="|PCI_DEVICE_ID_INTEL_DSA"
	ids+="|PCI_DEVICE_ID_INTEL_IAA"
	ids+="|PCI_DEVICE_ID_VIRTIO"
	ids+="|PCI_DEVICE_ID_INTEL_VMD"
	ids+="|SPDK_PCI_CLASS_NVME"

	local -gA nvme_d ioat_d dsa_d iaa_d virtio_d vmd_d all_devices_d drivers_d

	while read -r _ dev_type dev_id; do
		bdfs=(${pci_bus_cache["0x8086:$dev_id"]})
		[[ $dev_type == *NVME* ]] && bdfs=(${pci_bus_cache["$dev_id"]})
		[[ $dev_type == *VIRT* ]] && bdfs=(${pci_bus_cache["0x1af4:$dev_id"]})
		[[ $dev_type =~ (NVME|IOAT|DSA|IAA|VIRTIO|VMD) ]] && dev_type=${BASH_REMATCH[1],,}
		for bdf in "${bdfs[@]}"; do
			in_use=0
			if [[ $1 != status ]]; then
				if ! pci_can_use "$bdf"; then
					pci_dev_echo "$bdf" "Skipping denied controller at $bdf"
					in_use=1
				fi
				if [[ $dev_type == nvme || $dev_type == virtio ]]; then
					if ! verify_bdf_block_devs "$bdf"; then
						in_use=1
					fi
				fi
				if [[ $dev_type == vmd ]]; then
					if [[ $PCI_ALLOWED != *"$bdf"* ]]; then
						pci_dev_echo "$bdf" "Skipping not allowed VMD controller at $bdf"
						in_use=1
					elif [[ " ${drivers_d[*]} " =~ "nvme" ]]; then
						if [[ "${DRIVER_OVERRIDE}" != "none" ]]; then
							if [ "$mode" == "config" ]; then
								cat <<- MESSAGE
									Binding new driver to VMD device. If there are NVMe SSDs behind the VMD endpoint
									which are attached to the kernel NVMe driver, the binding process may go faster
									if you first run this script with DRIVER_OVERRIDE="none" to unbind only the
									NVMe SSDs, and then run again to unbind the VMD devices.
								MESSAGE
							fi
						fi
					fi
				fi
			fi
			eval "${dev_type}_d[$bdf]=$in_use"
			all_devices_d["$bdf"]=$in_use
			if [[ -e /sys/bus/pci/devices/$bdf/driver ]]; then
				driver=$(readlink -f "/sys/bus/pci/devices/$bdf/driver")
				drivers_d["$bdf"]=${driver##*/}
			fi
		done
	done < <(grep -E "$ids" "$rootdir/include/spdk/pci_ids.h")
}

function collect_driver() {
	local bdf=$1
	local drivers driver

	if [[ -e /sys/bus/pci/devices/$bdf/modalias ]] \
		&& drivers=($(modprobe -R "$(< "/sys/bus/pci/devices/$bdf/modalias")")); then
		# Pick the first entry in case multiple drivers match the alias.
		driver=$(readlink -f "/sys/module/${drivers[0]}/drivers/pci:"*)
		driver=${driver##*/}
	else
		[[ -n ${nvme_d["$bdf"]} ]] && driver=nvme
		[[ -n ${ioat_d["$bdf"]} ]] && driver=ioatdma
		[[ -n ${dsa_d["$bdf"]} ]] && driver=dsa
		[[ -n ${iaa_d["$bdf"]} ]] && driver=iaa
		[[ -n ${virtio_d["$bdf"]} ]] && driver=virtio-pci
		[[ -n ${vmd_d["$bdf"]} ]] && driver=vmd
	fi 2> /dev/null
	echo "$driver"
}

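# e.g. resolving the native driver for a hypothetical NVMe device:
#   $ cat /sys/bus/pci/devices/0000:01:00.0/modalias
#   pci:v00008086d00000953sv...     (truncated, illustrative)
#   $ modprobe -R "$(< /sys/bus/pci/devices/0000:01:00.0/modalias)"
#   nvme
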
function verify_bdf_block_devs() {
	local bdf=$1
	local blknames
	blknames=($(get_used_bdf_block_devs "$bdf")) || return 1

	if ((${#blknames[@]} > 0)); then
		local IFS=","
		pci_dev_echo "$bdf" "Active devices: ${blknames[*]}, so not binding PCI dev"
		return 1
	fi
}

function configure_linux_pci() {
	local driver_path=""
	driver_name=""
	igb_uio_fallback=""

	if [[ -r "$rootdir/dpdk/build-tmp/kernel/linux/igb_uio/igb_uio.ko" ]]; then
		# igb_uio is a common driver to override with and it depends on uio.
		modprobe uio || true
		if ! check_for_driver igb_uio || insmod "$rootdir/dpdk/build-tmp/kernel/linux/igb_uio/igb_uio.ko"; then
			igb_uio_fallback="$rootdir/dpdk/build-tmp/kernel/linux/igb_uio/igb_uio.ko"
		fi
	fi

	if [[ "${DRIVER_OVERRIDE}" == "none" ]]; then
		driver_name=none
	elif [[ -n "${DRIVER_OVERRIDE}" ]]; then
		driver_path="$DRIVER_OVERRIDE"
		driver_name="${DRIVER_OVERRIDE##*/}"
		# modprobe and the sysfs don't use the .ko suffix.
		driver_name=${driver_name%.ko}
		# path = name -> there is no path
		if [[ "$driver_path" = "$driver_name" ]]; then
			driver_path=""
		fi
	elif [[ -n "$(ls /sys/kernel/iommu_groups)" || (-e \
	/sys/module/vfio/parameters/enable_unsafe_noiommu_mode && \
	"$(cat /sys/module/vfio/parameters/enable_unsafe_noiommu_mode)" == "Y") ]]; then
		driver_name=vfio-pci
		# Just in case, attempt to load the VFIO_IOMMU_TYPE1 module into the kernel - this
		# should be done automatically by modprobe since this particular module should
		# be a part of vfio-pci dependencies, however, on some distros, it seems that
		# it's not the case. See #1689.
		if modinfo vfio_iommu_type1 > /dev/null; then
			modprobe vfio_iommu_type1
		fi
	elif ! check_for_driver uio_pci_generic || modinfo uio_pci_generic > /dev/null 2>&1; then
		driver_name=uio_pci_generic
	elif [[ -e $igb_uio_fallback ]]; then
		driver_path="$igb_uio_fallback"
		driver_name="igb_uio"
		echo "WARNING: uio_pci_generic not detected - using $driver_name"
	else
		echo "No valid drivers found [vfio-pci, uio_pci_generic, igb_uio]. Please enable one of the kernel modules."
		return 1
	fi

	# modprobe searches the kernel's module directory, so if the user passed in a
	# path, we need to use insmod instead.
	if [[ $driver_name != "none" ]]; then
		if [[ -n "$driver_path" ]]; then
			insmod $driver_path || true
		else
			modprobe $driver_name
		fi
	fi

	for bdf in "${!all_devices_d[@]}"; do
		if ((all_devices_d["$bdf"] == 0)); then
			if [[ -n ${nvme_d["$bdf"]} ]]; then
				# Some nvme controllers may take a significant amount of time while
				# being unbound from the driver. Put that task into the background to
				# speed up the whole process. Currently this is done only for devices
				# bound to the nvme driver as others, e.g. ioatdma's, trigger a kernel
				# BUG when being unbound in parallel. See
				# https://bugzilla.kernel.org/show_bug.cgi?id=209041.
				linux_bind_driver "$bdf" "$driver_name" &
			else
				linux_bind_driver "$bdf" "$driver_name"
			fi
		fi
	done
	wait

	echo "1" > "/sys/bus/pci/rescan"
}

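# Driver selection above, in order of preference (summarized for convenience):
#   1. DRIVER_OVERRIDE=none            -> just unbind, don't rebind anywhere
#   2. DRIVER_OVERRIDE=<name|path.ko>  -> modprobe the name or insmod the path
#   3. IOMMU enabled (or unsafe no-IOMMU mode) -> vfio-pci
#   4. uio_pci_generic, falling back to a locally built igb_uio as a last resort
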
function cleanup_linux() {
	local dirs_to_clean=() files_to_clean=() opened_files=() file_locks=()
	local match_spdk="spdk_tgt|iscsi|vhost|nvmf|rocksdb|bdevio|bdevperf|vhost_fuzz|nvme_fuzz|accel_perf|bdev_svc"

	dirs_to_clean=({/var/run,/tmp}/dpdk/spdk{,_pid}+([0-9]))
	if [[ -d $XDG_RUNTIME_DIR ]]; then
		dirs_to_clean+=("$XDG_RUNTIME_DIR/dpdk/spdk"{,_pid}+([0-9]))
	fi

	for dir in "${dirs_to_clean[@]}"; do
		files_to_clean+=("$dir/"*)
	done
	file_locks+=(/var/tmp/spdk_pci_lock*)

	files_to_clean+=(/dev/shm/@(@($match_spdk)_trace|spdk_iscsi_conns))
	files_to_clean+=("${file_locks[@]}")
	if ((${#files_to_clean[@]} == 0)); then
		echo "Clean"
		return 0
	fi

	opened_files+=($(readlink -f /proc/+([0-9])/fd/+([0-9])))

	if ((${#opened_files[@]} == 0)); then
		echo "Can't get list of opened files!"
		exit 1
	fi

	echo 'Cleaning'
	for f in "${files_to_clean[@]}"; do
		[[ -e $f ]] || continue
		if [[ ${opened_files[*]} != *"$f"* ]]; then
			echo "Removing:    $f"
			rm "$f"
		else
			echo "Still open: $f"
		fi
	done

	for dir in "${dirs_to_clean[@]}"; do
		[[ -d $dir ]] || continue
		if [[ ${opened_files[*]} != *"$dir"* ]]; then
			echo "Removing:    $dir"
			rmdir "$dir"
		else
			echo "Still open: $dir"
		fi
	done
	echo "Clean"
}

check_hugepages_alloc() {
	local hp_int=$1
	local allocated_hugepages

	allocated_hugepages=$(< "$hp_int")

	if ((NRHUGE <= allocated_hugepages)) && [[ $SHRINK_HUGE != yes ]]; then
		echo "INFO: Requested $NRHUGE hugepages but $allocated_hugepages already allocated ${2:+on node$2}"
		return 0
	fi

	echo $((NRHUGE < 0 ? 0 : NRHUGE)) > "$hp_int"

	allocated_hugepages=$(< "$hp_int")
	if ((allocated_hugepages < NRHUGE)); then
		cat <<- ERROR

			## ERROR: requested $NRHUGE hugepages but only $allocated_hugepages could be allocated ${2:+on node$2}.
			## Memory might be heavily fragmented. Please try flushing the system cache, or reboot the machine.
		ERROR
		return 1
	fi
}

clear_hugepages() { echo 0 > /proc/sys/vm/nr_hugepages; }

configure_linux_hugepages() {
	local node nodes
	local nodes_to_use nodes_hp

	if [[ $CLEAR_HUGE == yes ]]; then
		clear_hugepages
	fi

	if [[ $HUGE_EVEN_ALLOC == yes ]]; then
		clear_hugepages
		check_hugepages_alloc /proc/sys/vm/nr_hugepages
		return 0
	fi

	for node in /sys/devices/system/node/node*; do
		[[ -e $node ]] || continue
		nodes[${node##*node}]=$node/hugepages/hugepages-${HUGEPGSZ}kB/nr_hugepages
	done

	if ((${#nodes[@]} == 0)); then
		# No NUMA support? Fall back to the common interface.
		check_hugepages_alloc /proc/sys/vm/nr_hugepages
		return 0
	fi

	IFS="," read -ra nodes_to_use <<< "$HUGENODE"
	if ((${#nodes_to_use[@]} == 0)); then
		nodes_to_use[0]=0
	fi

	# Align indexes with node ids
	for node in "${!nodes_to_use[@]}"; do
		if [[ ${nodes_to_use[node]} =~ ^nodes_hp\[[0-9]+\]= ]]; then
			eval "${nodes_to_use[node]}"
		elif [[ ${nodes_to_use[node]} =~ ^[0-9]+$ ]]; then
			nodes_hp[nodes_to_use[node]]=$NRHUGE
		fi
	done

	for node in "${!nodes_hp[@]}"; do
		if [[ -z ${nodes[node]} ]]; then
			echo "Node $node doesn't exist, ignoring" >&2
			continue
		fi
		NRHUGE=${nodes_hp[node]:-$NRHUGE} check_hugepages_alloc "${nodes[node]}" "$node"
	done
}

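# e.g. with HUGENODE='nodes_hp[0]=2048,nodes_hp[1]=512,2' (values illustrative),
# the loop above evals nodes_hp[0]=2048 and nodes_hp[1]=512 directly, while the
# bare "2" is treated as a node id, so node2 gets the default NRHUGE; a plain
# HUGENODE='0,1' would apply NRHUGE to both node0 and node1.
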
function configure_linux() {
	configure_linux_pci
	hugetlbfs_mounts=$(linux_hugetlbfs_mounts)

	if [ -z "$hugetlbfs_mounts" ]; then
		hugetlbfs_mounts=/mnt/huge
		echo "Mounting hugetlbfs at $hugetlbfs_mounts"
		mkdir -p "$hugetlbfs_mounts"
		mount -t hugetlbfs nodev "$hugetlbfs_mounts"
	fi

	configure_linux_hugepages

	if [ "$driver_name" = "vfio-pci" ]; then
		if [ -n "$TARGET_USER" ]; then
			for mount in $hugetlbfs_mounts; do
				chown "$TARGET_USER" "$mount"
				chmod g+w "$mount"
			done

			MEMLOCK_AMNT=$(su "$TARGET_USER" -c "ulimit -l")
			if [[ $MEMLOCK_AMNT != "unlimited" ]]; then
				MEMLOCK_MB=$((MEMLOCK_AMNT / 1024))
				cat <<- MEMLOCK
					"$TARGET_USER" user memlock limit: $MEMLOCK_MB MB

					This is the maximum amount of memory you will be
					able to use with DPDK and VFIO if run as user "$TARGET_USER".
					To change this, please adjust limits.conf memlock limit for user "$TARGET_USER".
				MEMLOCK
				if ((MEMLOCK_AMNT < 65536)); then
					echo ""
					echo "## WARNING: memlock limit is less than 64MB"
					echo -n "## DPDK with VFIO may not be able to initialize "
					echo "if run as user \"$TARGET_USER\"."
				fi
			fi
		fi
	fi

	if [ $(uname -i) == "x86_64" ] && [ ! -e /dev/cpu/0/msr ]; then
		# Some distros build msr as a module.  Make sure it's loaded to ensure
		#  DPDK can easily figure out the TSC rate rather than relying on 100ms
		#  sleeps.
		modprobe msr &> /dev/null || true
	fi
}

function reset_linux_pci() {
	# virtio
	# TODO: check if virtio-pci is loaded first and just unbind if it is not loaded
	# Requires some more investigation - for example, some kernels do not seem to have
	#  virtio-pci but just virtio_scsi instead.  Also need to make sure we get the
	#  underscore vs. dash right in the virtio_scsi name.
	modprobe virtio-pci || true
	for bdf in "${!all_devices_d[@]}"; do
		((all_devices_d["$bdf"] == 0)) || continue

		driver=$(collect_driver "$bdf")
		if [[ -n $driver ]] && ! check_for_driver "$driver"; then
			linux_bind_driver "$bdf" "$driver"
		else
			linux_unbind_driver "$bdf"
		fi
	done

	echo "1" > "/sys/bus/pci/rescan"
}

function reset_linux() {
	reset_linux_pci
	for mount in $(linux_hugetlbfs_mounts); do
		for hp in "$mount"/spdk*map_*; do
			flock -n "$hp" true && rm -f "$hp"
		done
	done
	rm -f /run/.spdk*
}

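# flock -n succeeds only when nothing holds a lock on the hugepage-backed file,
# i.e. no live process is still using it, so only orphaned map files get removed.
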
function status_linux() {
	echo "Hugepages" >&2
	printf "%-6s %10s %8s / %6s\n" "node" "hugesize" "free" "total" >&2

	numa_nodes=0
	for path in /sys/devices/system/node/node*/hugepages/hugepages-*/; do
		numa_nodes=$((numa_nodes + 1))
		free_pages=$(cat $path/free_hugepages)
		all_pages=$(cat $path/nr_hugepages)

		[[ $path =~ (node[0-9]+)/hugepages/hugepages-([0-9]+kB) ]]

		node=${BASH_REMATCH[1]}
		huge_size=${BASH_REMATCH[2]}

		printf "%-6s %10s %8s / %6s\n" $node $huge_size $free_pages $all_pages
	done

	# fall back to system-wide hugepages
	if [ "$numa_nodes" = "0" ]; then
		free_pages=$(grep HugePages_Free /proc/meminfo | awk '{ print $2 }')
		all_pages=$(grep HugePages_Total /proc/meminfo | awk '{ print $2 }')
		node="-"
		huge_size="$HUGEPGSZ"

		printf "%-6s %10s %8s / %6s\n" $node $huge_size $free_pages $all_pages
	fi

	printf '\n%-8s %-15s %-6s %-6s %-7s %-16s %-10s %s\n' \
		"Type" "BDF" "Vendor" "Device" "NUMA" "Driver" "Device" "Block devices" >&2

	sorted_bdfs=($(printf '%s\n' "${!all_devices_d[@]}" | sort))

	for bdf in "${sorted_bdfs[@]}"; do
		driver=${drivers_d["$bdf"]}
		if [ "$numa_nodes" = "0" ]; then
			node="-"
		else
			node=$(cat /sys/bus/pci/devices/$bdf/numa_node)
			if ((node == -1)); then
				node=unknown
			fi
		fi
		if [ "$driver" = "nvme" ] && [ -d /sys/bus/pci/devices/$bdf/nvme ]; then
			name=$(ls /sys/bus/pci/devices/$bdf/nvme)
		else
			name="-"
		fi

		if [[ -n ${nvme_d["$bdf"]} || -n ${virtio_d["$bdf"]} ]]; then
			blknames=($(get_block_dev_from_bdf "$bdf"))
		else
			blknames=("-")
		fi

		desc=""
		desc=${desc:-${nvme_d["$bdf"]:+NVMe}}
		desc=${desc:-${ioat_d["$bdf"]:+I/OAT}}
		desc=${desc:-${dsa_d["$bdf"]:+DSA}}
		desc=${desc:-${iaa_d["$bdf"]:+IAA}}
		desc=${desc:-${virtio_d["$bdf"]:+virtio}}
		desc=${desc:-${vmd_d["$bdf"]:+VMD}}

		printf '%-8s %-15s %-6s %-6s %-7s %-16s %-10s %s\n' \
			"$desc" "$bdf" "${pci_ids_vendor["$bdf"]#0x}" "${pci_ids_device["$bdf"]#0x}" \
			"$node" "${driver:--}" "${name:-}" "${blknames[*]:--}"
	done
}

function status_freebsd() {
	local pci

	status_print() (
		local type=$1
		local dev driver

		shift

		for pci; do
			printf '%-8s %-15s %-6s %-6s %-16s\n' \
				"$type" \
				"$pci" \
				"${pci_ids_vendor["$pci"]}" \
				"${pci_ids_device["$pci"]}" \
				"${pci_bus_driver["$pci"]}"
		done | sort -k2,2
	)

	local contigmem=present
	local contigmem_buffer_size
	local contigmem_num_buffers

	if ! kldstat -q -m contigmem; then
		contigmem="not present"
	fi
	if ! contigmem_buffer_size=$(kenv hw.contigmem.buffer_size 2> /dev/null); then
		contigmem_buffer_size="not set"
	fi
	if ! contigmem_num_buffers=$(kenv hw.contigmem.num_buffers 2> /dev/null); then
		contigmem_num_buffers="not set"
	fi

	cat <<- BSD_INFO
		Contigmem ($contigmem)
		Buffer Size: $contigmem_buffer_size
		Num Buffers: $contigmem_num_buffers

	BSD_INFO

	printf '\n%-8s %-15s %-6s %-6s %-16s\n' \
		"Type" "BDF" "Vendor" "Device" "Driver" >&2

	status_print "NVMe" "${!nvme_d[@]}"
	status_print "I/OAT" "${!ioat_d[@]}"
	status_print "DSA" "${!dsa_d[@]}"
	status_print "IAA" "${!iaa_d[@]}"
	status_print "VMD" "${!vmd_d[@]}"
}

function configure_freebsd_pci() {
	local BDFS

	BDFS+=("${!nvme_d[@]}")
	BDFS+=("${!ioat_d[@]}")
	BDFS+=("${!dsa_d[@]}")
	BDFS+=("${!iaa_d[@]}")
	BDFS+=("${!vmd_d[@]}")

	# Drop the domain part from all the addresses
	BDFS=("${BDFS[@]#*:}")

	local IFS=","
	kldunload nic_uio.ko || true
	kenv hw.nic_uio.bdfs="${BDFS[*]}"
	kldload nic_uio.ko
}

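# e.g. for devices at 0000:01:00.0 and 0000:02:00.0 (illustrative addresses),
# the kenv above becomes:
#   kenv hw.nic_uio.bdfs="01:00.0,02:00.0"
# i.e. domain stripped, comma-separated, consumed by nic_uio at load time.
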
function configure_freebsd() {
	if ! check_for_driver_freebsd; then
		echo "DPDK drivers (contigmem and/or nic_uio) are missing, aborting" >&2
		return 1
	fi
	configure_freebsd_pci
	# If contigmem is already loaded but the HUGEMEM specified doesn't match the
	#  previous value, unload contigmem so that we can reload with the new value.
	if kldstat -q -m contigmem; then
		# contigmem may be loaded, but the kernel environment doesn't
		# necessarily have to be set at this point. If it isn't, kenv will
		# fail to pick up the hw. options. Handle it.
		if ! contigmem_num_buffers=$(kenv hw.contigmem.num_buffers); then
			contigmem_num_buffers=-1
		fi 2> /dev/null
		if ((contigmem_num_buffers != HUGEMEM / 256)); then
			kldunload contigmem.ko
		fi
	fi
	if ! kldstat -q -m contigmem; then
		kenv hw.contigmem.num_buffers=$((HUGEMEM / 256))
		kenv hw.contigmem.buffer_size=$((256 * 1024 * 1024))
		kldload contigmem.ko
	fi
}

function reset_freebsd() {
	kldunload contigmem.ko || true
	kldunload nic_uio.ko || true
}

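# e.g. with the default HUGEMEM=2048, configure_freebsd loads contigmem with
# hw.contigmem.num_buffers=8 and hw.contigmem.buffer_size=268435456,
# i.e. 8 x 256 MB = 2048 MB of reserved memory.
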
CMD=reset cache_pci_bus

mode=$1

if [ -z "$mode" ]; then
	mode="config"
fi

: ${HUGEMEM:=2048}
: ${PCI_ALLOWED:=""}
: ${PCI_BLOCKED:=""}

if [ -n "$NVME_ALLOWED" ]; then
	PCI_ALLOWED="$PCI_ALLOWED $NVME_ALLOWED"
fi

if [ -n "$SKIP_PCI" ]; then
	PCI_ALLOWED="none"
fi

if [ -z "$TARGET_USER" ]; then
	TARGET_USER="$SUDO_USER"
	if [ -z "$TARGET_USER" ]; then
		TARGET_USER=$(logname 2> /dev/null) || true
	fi
fi

collect_devices "$mode"

if [[ $mode == reset && $PCI_BLOCK_SYNC_ON_RESET == yes ]]; then
	# Note that this will wait only for the first block device attached to
	# a given storage controller. For nvme this may miss some of the devs
	# in case multiple namespaces are in place.
	# FIXME: Wait for nvme controller(s) to be in live state and determine
	# number of configured namespaces, build list of potential block devs
	# and pass them to sync_dev_uevents. Is it worth the effort?
	bdfs_to_wait_for=()
	for bdf in "${!all_devices_d[@]}"; do
		((all_devices_d["$bdf"] == 0)) || continue
		if [[ -n ${nvme_d["$bdf"]} || -n ${virtio_d["$bdf"]} ]]; then
			[[ $(collect_driver "$bdf") != "${drivers_d["$bdf"]}" ]] || continue
			bdfs_to_wait_for+=("$bdf")
		fi
	done
	if ((${#bdfs_to_wait_for[@]} > 0)); then
		echo "Waiting for block devices as requested"
		export UEVENT_TIMEOUT=5 DEVPATH_LOOKUP=yes DEVPATH_SUBSYSTEM=pci
		"$rootdir/scripts/sync_dev_uevents.sh" \
			block/disk \
			"${bdfs_to_wait_for[@]}" &
		sync_pid=$!
	fi
fi

if [[ $os == Linux ]]; then
	if [[ -n $HUGEPGSZ && ! -e /sys/kernel/mm/hugepages/hugepages-${HUGEPGSZ}kB ]]; then
		echo "${HUGEPGSZ}kB is not supported by the running kernel, ignoring" >&2
		unset -v HUGEPGSZ
	fi

	HUGEPGSZ=${HUGEPGSZ:-$(grep Hugepagesize /proc/meminfo | cut -d : -f 2 | tr -dc '0-9')}
	HUGEPGSZ_MB=$((HUGEPGSZ / 1024))
	: ${NRHUGE=$(((HUGEMEM + HUGEPGSZ_MB - 1) / HUGEPGSZ_MB))}
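	# e.g. with HUGEMEM=2048 and 2 MB hugepages (HUGEPGSZ=2048 kB, HUGEPGSZ_MB=2),
	# NRHUGE = (2048 + 2 - 1) / 2 = 1024 pages, i.e. HUGEMEM rounded up to a
	# whole number of pages.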

	if [ "$mode" == "config" ]; then
		configure_linux
	elif [ "$mode" == "cleanup" ]; then
		cleanup_linux
		clear_hugepages
	elif [ "$mode" == "reset" ]; then
		reset_linux
	elif [ "$mode" == "status" ]; then
		status_linux
	elif [ "$mode" == "help" ]; then
		usage $0
	else
		usage $0 "Invalid argument '$mode'"
	fi
else
	if [ "$mode" == "config" ]; then
		configure_freebsd
	elif [ "$mode" == "reset" ]; then
		reset_freebsd
	elif [ "$mode" == "cleanup" ]; then
		echo "setup.sh cleanup function not yet supported on $os"
	elif [ "$mode" == "status" ]; then
		status_freebsd
	elif [ "$mode" == "help" ]; then
		usage $0
	else
		usage $0 "Invalid argument '$mode'"
	fi
fi

if [[ -e /proc/$sync_pid/status ]]; then
	wait "$sync_pid"
fi