xref: /spdk/scripts/setup.sh (revision 1f3a6b0398dfba2d9aedea1d272e64e55d6f1af6)
#!/usr/bin/env bash

set -e

os=$(uname -s)

if [[ $os != Linux && $os != FreeBSD ]]; then
	echo "Unsupported platform ($os), aborting"
	exit 1
fi

rootdir=$(readlink -f $(dirname $0))/..
source "$rootdir/scripts/common.sh"

function usage() {
	if [[ $os == Linux ]]; then
		options="[config|reset|status|cleanup|help]"
	else
		options="[config|reset|help]"
	fi

	[[ -n $2 ]] && (
		echo "$2"
		echo ""
	)
	echo "Helper script for allocating hugepages and binding NVMe, I/OAT, VMD and Virtio devices"
	echo "to a generic VFIO kernel driver. If VFIO is not available on the system, this script"
	echo "will fall back to UIO. NVMe and Virtio devices with active mountpoints will be ignored."
	echo "All hugepage operations use the default hugepage size on the system (hugepagesz)."
	echo "Usage: $(basename $1) $options"
	echo
	echo "$options - as follows:"
	echo "config            Default mode. Allocate hugepages and bind PCI devices."
	if [[ $os == Linux ]]; then
		echo "cleanup           Remove any orphaned files that may be left in the system after an SPDK application exits"
	fi
	echo "reset             Rebind PCI devices back to their original drivers."
	echo "                  Also clean up any leftover SPDK files/resources."
	echo "                  Hugepage memory size will remain unchanged."
	if [[ $os == Linux ]]; then
		echo "status            Print status of all SPDK-compatible devices on the system."
	fi
	echo "help              Print this help message."
	echo
	echo "The following environment variables can be specified."
	echo "HUGEMEM           Size of hugepage memory to allocate (in MB). 2048 by default."
	echo "                  For NUMA systems, the hugepages will be distributed on node0 by"
	echo "                  default."
	echo "HUGE_EVEN_ALLOC   If set to 'yes', hugepages will be evenly distributed across all of"
	echo "                  the system's NUMA nodes (effectively ignoring anything set in HUGENODE)."
	echo "                  Uses the kernel's default hugepage size."
	echo "NRHUGE            Number of hugepages to allocate. This variable overrides HUGEMEM."
	echo "HUGENODE          Specific NUMA node to allocate hugepages on. Multiple nodes can be"
	echo "                  separated with commas. By default, NRHUGE will be applied on each node."
	echo "                  Hugepages can be defined per node with e.g.:"
	echo "                  HUGENODE='nodes_hp[0]=2048,nodes_hp[1]=512,2' - this will allocate"
	echo "                  2048 pages for node0, 512 for node1 and default NRHUGE for node2."
	echo "HUGEPGSZ          Size of the hugepages to use in kB. If not set, the kernel's default"
	echo "                  setting is used."
	echo "CLEAR_HUGE        If set to 'yes', an attempt to remove hugepages from all nodes will"
	echo "                  be made prior to allocation."
	echo "PCI_ALLOWED"
	echo "PCI_BLOCKED       Whitespace-separated list of PCI devices (NVMe, I/OAT, VMD, Virtio)."
	echo "                  Each device must be specified as a full PCI address."
	echo "                  E.g. PCI_ALLOWED=\"0000:01:00.0 0000:02:00.0\""
	echo "                  To block all PCI devices: PCI_ALLOWED=\"none\""
	echo "                  To allow all PCI devices except 0000:01:00.0: PCI_BLOCKED=\"0000:01:00.0\""
	echo "                  To allow only PCI device 0000:01:00.0: PCI_ALLOWED=\"0000:01:00.0\""
	echo "                  If PCI_ALLOWED and PCI_BLOCKED are empty or unset, all PCI devices"
	echo "                  will be bound."
	echo "                  Each device in PCI_BLOCKED will be ignored (driver won't be changed)."
	echo "                  PCI_BLOCKED has precedence over PCI_ALLOWED."
	echo "TARGET_USER       User that will own the hugepage mountpoint directory and VFIO groups."
	echo "                  By default, the current user will be used."
	echo "DRIVER_OVERRIDE   Disable automatic vfio-pci/uio_pci_generic selection and forcefully"
	echo "                  bind devices to the given driver."
	echo "                  E.g. DRIVER_OVERRIDE=uio_pci_generic or DRIVER_OVERRIDE=/home/public/dpdk/build/kmod/igb_uio.ko"
	echo "PCI_BLOCK_SYNC_ON_RESET"
	echo "                  If set in the environment, an attempt to wait for block devices associated"
	echo "                  with a given PCI device will be made upon reset"
	exit 0
}
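
# Example invocations (illustrative only; the size and the PCI address below are placeholders):
#   sudo ./setup.sh                              # "config" is the default mode
#   sudo HUGEMEM=4096 ./setup.sh config          # allocate 4096 MB of hugepages before binding
#   sudo PCI_BLOCKED="0000:01:00.0" ./setup.sh   # bind everything except one controller
#   sudo ./setup.sh reset                        # rebind devices to their original drivers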

# In monolithic kernels lsmod won't work, so fall back to checking
# /sys/module. We also check /sys/bus/pci/drivers/ as neither lsmod
# nor /sys/module might contain the needed info (like on Fedora-like
# distributions).
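# Return value convention (relied upon by the callers below): 0 when the module
# does not appear to be present, 1 when it shows up in lsmod, 2 when it is only
# visible through sysfs.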
function check_for_driver() {
	if [[ -z $1 ]]; then
		return 0
	fi

	if lsmod | grep -q ${1//-/_}; then
		return 1
	fi

	if [[ -d /sys/module/${1} || -d \
		/sys/module/${1//-/_} || -d \
		/sys/bus/pci/drivers/${1} || -d \
		/sys/bus/pci/drivers/${1//-/_} ]]; then
		return 2
	fi
	return 0
}

function check_for_driver_freebsd() {
	# Check if dpdk drivers (nic_uio, contigmem) are in the kernel's module path.
	local search_paths path driver
	IFS=";" read -ra search_paths < <(kldconfig -rU)

	for driver in contigmem.ko nic_uio.ko; do
		for path in "${search_paths[@]}"; do
			[[ -f $path/$driver ]] && continue 2
		done
		return 1
	done
	return 0
}

function pci_dev_echo() {
	local bdf="$1"
	shift
	echo "$bdf (${pci_ids_vendor["$bdf"]#0x} ${pci_ids_device["$bdf"]#0x}): $*"
}

function linux_bind_driver() {
	bdf="$1"
	driver_name="$2"
	old_driver_name=${drivers_d["$bdf"]:-no driver}
	ven_dev_id="${pci_ids_vendor["$bdf"]#0x} ${pci_ids_device["$bdf"]#0x}"

	if [[ $driver_name == "$old_driver_name" ]]; then
		pci_dev_echo "$bdf" "Already using the $old_driver_name driver"
		return 0
	fi

	if [[ $old_driver_name != "no driver" ]]; then
		echo "$ven_dev_id" > "/sys/bus/pci/devices/$bdf/driver/remove_id" 2> /dev/null || true
		echo "$bdf" > "/sys/bus/pci/devices/$bdf/driver/unbind"
	fi

	pci_dev_echo "$bdf" "$old_driver_name -> $driver_name"

	if [[ $driver_name == "none" ]]; then
		return 0
	fi

	echo "$ven_dev_id" > "/sys/bus/pci/drivers/$driver_name/new_id" 2> /dev/null || true
	echo "$bdf" > "/sys/bus/pci/drivers/$driver_name/bind" 2> /dev/null || true

	if [[ $driver_name == uio_pci_generic ]] && ! check_for_driver igb_uio; then
		# Check if the uio_pci_generic driver is broken as it might be in
		# some 4.18.x kernels (see centos8 for instance) - if our device
		# didn't get a proper uio entry, fall back to igb_uio
		if [[ ! -e /sys/bus/pci/devices/$bdf/uio ]]; then
			pci_dev_echo "$bdf" "uio_pci_generic potentially broken, moving to igb_uio"
			drivers_d["$bdf"]="no driver"
			# This call will override $driver_name for remaining devices as well
			linux_bind_driver "$bdf" igb_uio
		fi
	fi

	iommu_group=$(basename $(readlink -f /sys/bus/pci/devices/$bdf/iommu_group))
	if [ -e "/dev/vfio/$iommu_group" ]; then
		if [ -n "$TARGET_USER" ]; then
			chown "$TARGET_USER" "/dev/vfio/$iommu_group"
		fi
	fi
}
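# Example call (the BDF is a hypothetical example): linux_bind_driver "0000:01:00.0" vfio-pci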

function linux_unbind_driver() {
	local bdf="$1"
	local ven_dev_id
	ven_dev_id="${pci_ids_vendor["$bdf"]#0x} ${pci_ids_device["$bdf"]#0x}"
	local old_driver_name=${drivers_d["$bdf"]:-no driver}

	if [[ $old_driver_name == "no driver" ]]; then
		pci_dev_echo "$bdf" "Not bound to any driver"
		return 0
	fi

	if [[ -e /sys/bus/pci/drivers/$old_driver_name ]]; then
		echo "$ven_dev_id" > "/sys/bus/pci/drivers/$old_driver_name/remove_id" 2> /dev/null || true
		echo "$bdf" > "/sys/bus/pci/drivers/$old_driver_name/unbind"
	fi

	pci_dev_echo "$bdf" "$old_driver_name -> no driver"
}

function linux_hugetlbfs_mounts() {
	mount | grep ' type hugetlbfs ' | awk '{ print $3 }'
}

function get_block_dev_from_bdf() {
	local bdf=$1
	local block

	for block in /sys/block/*; do
		if [[ $(readlink -f "$block/device") == *"/$bdf/"* ]]; then
			echo "${block##*/}"
		fi
	done
}
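# Prints the kernel block device names sitting behind the given controller, one
# per line, e.g. (illustrative) get_block_dev_from_bdf 0000:01:00.0 -> nvme0n1.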

function get_used_bdf_block_devs() {
	local bdf=$1
	local blocks block blockp dev mount holder
	local used

	hash lsblk &> /dev/null || return 1
	blocks=($(get_block_dev_from_bdf "$bdf"))

	for block in "${blocks[@]}"; do
		# Check if the device is held by some other device, regardless of whether
		# it's mounted or not.
		for holder in "/sys/class/block/$block"*/holders/*; do
			[[ -e $holder ]] || continue
			blockp=${holder%/holders*} blockp=${blockp##*/}
			if [[ -e $holder/slaves/$blockp ]]; then
				used+=("holder@$blockp:${holder##*/}")
			fi
		done
		while read -r dev mount; do
			if [[ -e $mount ]]; then
				used+=("mount@$block:$dev")
			fi
		done < <(lsblk -l -n -o NAME,MOUNTPOINT "/dev/$block")
		if ((${#used[@]} == 0)); then
			# Make sure we check if there's any valid data present on the target device
			# regardless of whether it's being actively used or not. This is mainly done
			# to make sure we don't miss more complex setups like ZFS pools, etc.
			if block_in_use "$block" > /dev/null; then
				used+=("data@$block")
			fi
		fi
	done

	if ((${#used[@]} > 0)); then
		printf '%s\n' "${used[@]}"
	fi
}
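# Each line printed above is tagged with the reason the device counts as used:
#   holder@<dev>:<holder> - the device backs another block device (e.g. dm/md),
#   mount@<block>:<dev>   - the device or one of its children has a mountpoint,
#   data@<block>          - no holders/mounts, but block_in_use() found data on it.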

function collect_devices() {
	# NVMe, IOAT, DSA, IAA, VIRTIO, VMD

	local ids dev_type dev_id bdf bdfs in_use driver

	ids+="PCI_DEVICE_ID_INTEL_IOAT"
	ids+="|PCI_DEVICE_ID_INTEL_DSA"
	ids+="|PCI_DEVICE_ID_INTEL_IAA"
	ids+="|PCI_DEVICE_ID_VIRTIO"
	ids+="|PCI_DEVICE_ID_INTEL_VMD"
	ids+="|SPDK_PCI_CLASS_NVME"

	local -gA nvme_d ioat_d dsa_d iaa_d virtio_d vmd_d all_devices_d drivers_d

	while read -r _ dev_type dev_id; do
		bdfs=(${pci_bus_cache["0x8086:$dev_id"]})
		[[ $dev_type == *NVME* ]] && bdfs=(${pci_bus_cache["$dev_id"]})
		[[ $dev_type == *VIRT* ]] && bdfs=(${pci_bus_cache["0x1af4:$dev_id"]})
		[[ $dev_type =~ (NVME|IOAT|DSA|IAA|VIRTIO|VMD) ]] && dev_type=${BASH_REMATCH[1],,}
		for bdf in "${bdfs[@]}"; do
			in_use=0
			if [[ $1 != status ]]; then
				if ! pci_can_use "$bdf"; then
					pci_dev_echo "$bdf" "Skipping denied controller at $bdf"
					in_use=1
				fi
				if [[ $dev_type == nvme || $dev_type == virtio ]]; then
					if ! verify_bdf_block_devs "$bdf"; then
						in_use=1
					fi
				fi
				if [[ $dev_type == vmd ]]; then
					if [[ $PCI_ALLOWED != *"$bdf"* ]]; then
						pci_dev_echo "$bdf" "Skipping disallowed VMD controller at $bdf"
						in_use=1
					elif [[ " ${drivers_d[*]} " =~ "nvme" ]]; then
						if [[ "${DRIVER_OVERRIDE}" != "none" ]]; then
							if [ "$mode" == "config" ]; then
								cat <<- MESSAGE
									Binding new driver to VMD device. If there are NVMe SSDs behind the VMD endpoint
									which are attached to the kernel NVMe driver, the binding process may go faster
									if you first run this script with DRIVER_OVERRIDE="none" to unbind only the
									NVMe SSDs, and then run again to unbind the VMD devices.
								MESSAGE
							fi
						fi
					fi
				fi
			fi
			eval "${dev_type}_d[$bdf]=$in_use"
			all_devices_d["$bdf"]=$in_use
			if [[ -e /sys/bus/pci/devices/$bdf/driver ]]; then
				driver=$(readlink -f "/sys/bus/pci/devices/$bdf/driver")
				drivers_d["$bdf"]=${driver##*/}
			fi
		done
	done < <(grep -E "$ids" "$rootdir/include/spdk/pci_ids.h")
}
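# After this runs, the global associative arrays are keyed by BDF: nvme_d, ioat_d,
# dsa_d, iaa_d, virtio_d, vmd_d and all_devices_d hold 0 (usable) or 1 (skipped),
# while drivers_d holds the driver each device is currently bound to.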

function collect_driver() {
	local bdf=$1
	local drivers driver

	if [[ -e /sys/bus/pci/devices/$bdf/modalias ]] \
		&& drivers=($(modprobe -R "$(< "/sys/bus/pci/devices/$bdf/modalias")")); then
		# Pick first entry in case multiple aliases are bound to a driver.
		driver=$(readlink -f "/sys/module/${drivers[0]}/drivers/pci:"*)
		driver=${driver##*/}
	else
		[[ -n ${nvme_d["$bdf"]} ]] && driver=nvme
		[[ -n ${ioat_d["$bdf"]} ]] && driver=ioatdma
		[[ -n ${dsa_d["$bdf"]} ]] && driver=dsa
		[[ -n ${iaa_d["$bdf"]} ]] && driver=iaa
		[[ -n ${virtio_d["$bdf"]} ]] && driver=virtio-pci
		[[ -n ${vmd_d["$bdf"]} ]] && driver=vmd
	fi 2> /dev/null
	echo "$driver"
}

function verify_bdf_block_devs() {
	local bdf=$1
	local blknames
	blknames=($(get_used_bdf_block_devs "$bdf")) || return 1

	if ((${#blknames[@]} > 0)); then
		local IFS=","
		pci_dev_echo "$bdf" "Active devices: ${blknames[*]}, so not binding PCI dev"
		return 1
	fi
}

function configure_linux_pci() {
	local driver_path=""
	driver_name=""
	igb_uio_fallback=""

	if [[ -r "$rootdir/dpdk/build-tmp/kernel/linux/igb_uio/igb_uio.ko" ]]; then
		# igb_uio is a common driver to override with and it depends on uio.
		modprobe uio || true
		if ! check_for_driver igb_uio || insmod "$rootdir/dpdk/build-tmp/kernel/linux/igb_uio/igb_uio.ko"; then
			igb_uio_fallback="$rootdir/dpdk/build-tmp/kernel/linux/igb_uio/igb_uio.ko"
		fi
	fi

	if [[ "${DRIVER_OVERRIDE}" == "none" ]]; then
		driver_name=none
	elif [[ -n "${DRIVER_OVERRIDE}" ]]; then
		driver_path="$DRIVER_OVERRIDE"
		driver_name="${DRIVER_OVERRIDE##*/}"
		# modprobe and the sysfs don't use the .ko suffix.
		driver_name=${driver_name%.ko}
		# path = name -> there is no path
		if [[ "$driver_path" = "$driver_name" ]]; then
			driver_path=""
		fi
	elif [[ -n "$(ls /sys/kernel/iommu_groups)" || (-e \
	/sys/module/vfio/parameters/enable_unsafe_noiommu_mode && \
	"$(cat /sys/module/vfio/parameters/enable_unsafe_noiommu_mode)" == "Y") ]]; then
		driver_name=vfio-pci
		# Just in case, attempt to load VFIO_IOMMU_TYPE1 module into the kernel - this
		# should be done automatically by modprobe since this particular module should
		# be a part of vfio-pci dependencies, however, on some distros, it seems that
		# it's not the case. See #1689.
		if modinfo vfio_iommu_type1 > /dev/null; then
			modprobe vfio_iommu_type1
		fi
	elif ! check_for_driver uio_pci_generic || modinfo uio_pci_generic > /dev/null 2>&1; then
		driver_name=uio_pci_generic
	elif [[ -e $igb_uio_fallback ]]; then
		driver_path="$igb_uio_fallback"
		driver_name="igb_uio"
		echo "WARNING: uio_pci_generic not detected - using $driver_name"
	else
		echo "No valid drivers found [vfio-pci, uio_pci_generic, igb_uio]. Please enable one of the kernel modules."
		return 1
	fi

	# modprobe assumes the module lives in the kernel's module directory. If the
	# user passed in a path, use insmod instead.
	if [[ $driver_name != "none" ]]; then
		if [[ -n "$driver_path" ]]; then
			insmod $driver_path || true
		else
			modprobe $driver_name
		fi
	fi

	for bdf in "${!all_devices_d[@]}"; do
		if ((all_devices_d["$bdf"] == 0)); then
			if [[ -n ${nvme_d["$bdf"]} ]]; then
				# Some nvme controllers may take a significant amount of time while being
				# unbound from the driver. Put that task into the background to speed up
				# the whole process. Currently this is done only for the devices bound to
				# the nvme driver as others, i.e. ioatdma's, trigger a kernel BUG when
				# being unbound in parallel. See https://bugzilla.kernel.org/show_bug.cgi?id=209041.
				linux_bind_driver "$bdf" "$driver_name" &
			else
				linux_bind_driver "$bdf" "$driver_name"
			fi
		fi
	done
	wait

	echo "1" > "/sys/bus/pci/rescan"
}
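# Driver selection above, in order of preference: DRIVER_OVERRIDE (or "none"),
# then vfio-pci when IOMMU groups exist (or unsafe no-IOMMU mode is enabled),
# then uio_pci_generic, and finally the locally built igb_uio as a last resort.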

function cleanup_linux() {
	shopt -s extglob nullglob
	dirs_to_clean=""
	dirs_to_clean="$(echo {/var/run,/tmp}/dpdk/spdk{,_pid}+([0-9])) "
	if [[ -d $XDG_RUNTIME_DIR && $XDG_RUNTIME_DIR != *" "* ]]; then
		dirs_to_clean+="$(readlink -e assert_not_empty $XDG_RUNTIME_DIR/dpdk/spdk{,_pid}+([0-9]) || true) "
	fi

	files_to_clean="" file_locks=()
	for dir in $dirs_to_clean; do
		files_to_clean+="$(echo $dir/*) "
	done
	file_locks+=(/var/tmp/spdk_pci_lock*)
	shopt -u extglob nullglob

	files_to_clean+="$(ls -1 /dev/shm/* \
		| grep -E '(spdk_tgt|iscsi|vhost|nvmf|rocksdb|bdevio|bdevperf|vhost_fuzz|nvme_fuzz|accel_perf|bdev_svc)_trace|spdk_iscsi_conns' || true) "
	files_to_clean+=" ${file_locks[*]}"
	files_to_clean="$(readlink -e assert_not_empty $files_to_clean || true)"
	if [[ -z "$files_to_clean" ]]; then
		echo "Clean"
		return 0
	fi

	shopt -s extglob
	for fd_dir in $(echo /proc/+([0-9])); do
		opened_files+="$(readlink -e assert_not_empty $fd_dir/fd/* || true)"
	done
	shopt -u extglob

	if [[ -z "$opened_files" ]]; then
		echo "Can't get list of opened files!"
		exit 1
	fi

	echo 'Cleaning'
	for f in $files_to_clean; do
		if ! echo "$opened_files" | grep -E -q "^$f\$"; then
			echo "Removing:    $f"
			rm $f
		else
			echo "Still open: $f"
		fi
	done

	for dir in $dirs_to_clean; do
		if ! echo "$opened_files" | grep -E -q "^$dir\$"; then
			echo "Removing:    $dir"
			rmdir $dir
		else
			echo "Still open: $dir"
		fi
	done
	echo "Clean"

	unset dirs_to_clean files_to_clean opened_files
}

check_hugepages_alloc() {
	local hp_int=$1
	local allocated_hugepages

	echo $((NRHUGE < 0 ? 0 : NRHUGE)) > "$hp_int"

	allocated_hugepages=$(< "$hp_int")
	if ((allocated_hugepages < NRHUGE)); then
		cat <<- ERROR

			## ERROR: requested $NRHUGE hugepages but only $allocated_hugepages could be allocated ${2:+on node$2}.
			## Memory might be heavily fragmented. Please try flushing the system cache, or reboot the machine.
		ERROR
		return 1
	fi
}
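# Called either with the global interface (/proc/sys/vm/nr_hugepages) or with a
# per-node sysfs knob, e.g. (illustrative):
#   NRHUGE=1024 check_hugepages_alloc "/sys/devices/system/node/node0/hugepages/hugepages-2048kB/nr_hugepages" 0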

clear_hugepages() { echo 0 > /proc/sys/vm/nr_hugepages; }

configure_linux_hugepages() {
	local node system_nodes
	local nodes_to_use nodes_hp

	if [[ $CLEAR_HUGE == yes ]]; then
		clear_hugepages
	fi

	if [[ $HUGE_EVEN_ALLOC == yes ]]; then
		clear_hugepages
		check_hugepages_alloc /proc/sys/vm/nr_hugepages
		return 0
	fi

	for node in /sys/devices/system/node/node*; do
		[[ -e $node ]] || continue
		nodes[${node##*node}]=$node/hugepages/hugepages-${HUGEPGSZ}kB/nr_hugepages
	done

	if ((${#nodes[@]} == 0)); then
		# No NUMA support? Fall back to the common interface
		check_hugepages_alloc /proc/sys/vm/nr_hugepages
		return 0
	fi

	IFS="," read -ra nodes_to_use <<< "$HUGENODE"
	if ((${#nodes_to_use[@]} == 0)); then
		nodes_to_use[0]=0
	fi

	# Align indexes with node ids
	for node in "${!nodes_to_use[@]}"; do
		if [[ ${nodes_to_use[node]} =~ ^nodes_hp\[[0-9]+\]= ]]; then
			eval "${nodes_to_use[node]}"
		elif [[ ${nodes_to_use[node]} =~ ^[0-9]+$ ]]; then
			nodes_hp[nodes_to_use[node]]=$NRHUGE
		fi
	done

	for node in "${!nodes_hp[@]}"; do
		if [[ -z ${nodes[node]} ]]; then
			echo "Node $node doesn't exist, ignoring" >&2
			continue
		fi
		NRHUGE=${nodes_hp[node]:-$NRHUGE} check_hugepages_alloc "${nodes[node]}" "$node"
	done
}
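# HUGENODE accepts plain node ids or per-node page counts, e.g. (illustrative):
#   HUGENODE=0,1 NRHUGE=512 ./setup.sh        -> 512 hugepages on node0 and on node1
#   HUGENODE='nodes_hp[0]=2048,1' ./setup.sh  -> 2048 pages on node0, default NRHUGE on node1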

function configure_linux() {
	configure_linux_pci
	hugetlbfs_mounts=$(linux_hugetlbfs_mounts)

	if [ -z "$hugetlbfs_mounts" ]; then
		hugetlbfs_mounts=/mnt/huge
		echo "Mounting hugetlbfs at $hugetlbfs_mounts"
		mkdir -p "$hugetlbfs_mounts"
		mount -t hugetlbfs nodev "$hugetlbfs_mounts"
	fi

	configure_linux_hugepages

	if [ "$driver_name" = "vfio-pci" ]; then
		if [ -n "$TARGET_USER" ]; then
			for mount in $hugetlbfs_mounts; do
				chown "$TARGET_USER" "$mount"
				chmod g+w "$mount"
			done

			MEMLOCK_AMNT=$(su "$TARGET_USER" -c "ulimit -l")
			if [[ $MEMLOCK_AMNT != "unlimited" ]]; then
				MEMLOCK_MB=$((MEMLOCK_AMNT / 1024))
				cat <<- MEMLOCK
					"$TARGET_USER" user memlock limit: $MEMLOCK_MB MB

					This is the maximum amount of memory you will be
					able to use with DPDK and VFIO if run as user "$TARGET_USER".
					To change this, please adjust limits.conf memlock limit for user "$TARGET_USER".
				MEMLOCK
				if ((MEMLOCK_AMNT < 65536)); then
					echo ""
					echo "## WARNING: memlock limit is less than 64MB"
					echo -n "## DPDK with VFIO may not be able to initialize "
					echo "if run as user \"$TARGET_USER\"."
				fi
			fi
		fi
	fi

	if [ $(uname -i) == "x86_64" ] && [ ! -e /dev/cpu/0/msr ]; then
		# Some distros build msr as a module.  Make sure it's loaded to ensure
		#  DPDK can easily figure out the TSC rate rather than relying on 100ms
		#  sleeps.
		modprobe msr &> /dev/null || true
	fi
}

function reset_linux_pci() {
	# virtio
	# TODO: check if virtio-pci is loaded first and just unbind if it is not loaded
	# Requires some more investigation - for example, some kernels do not seem to have
	#  virtio-pci but just virtio_scsi instead.  Also need to make sure we get the
	#  underscore vs. dash right in the virtio_scsi name.
	modprobe virtio-pci || true
	for bdf in "${!all_devices_d[@]}"; do
		((all_devices_d["$bdf"] == 0)) || continue

		driver=$(collect_driver "$bdf")
		if [[ -n $driver ]] && ! check_for_driver "$driver"; then
			linux_bind_driver "$bdf" "$driver"
		else
			linux_unbind_driver "$bdf"
		fi
	done

	echo "1" > "/sys/bus/pci/rescan"
}

function reset_linux() {
	reset_linux_pci
	for mount in $(linux_hugetlbfs_mounts); do
		for hp in "$mount"/spdk*map_*; do
			flock -n "$hp" true && rm -f "$hp"
		done
	done
	rm -f /run/.spdk*
}

function status_linux() {
	echo "Hugepages" >&2
	printf "%-6s %10s %8s / %6s\n" "node" "hugesize" "free" "total" >&2

	numa_nodes=0
	shopt -s nullglob
	for path in /sys/devices/system/node/node*/hugepages/hugepages-*/; do
		numa_nodes=$((numa_nodes + 1))
		free_pages=$(cat $path/free_hugepages)
		all_pages=$(cat $path/nr_hugepages)

		[[ $path =~ (node[0-9]+)/hugepages/hugepages-([0-9]+kB) ]]

		node=${BASH_REMATCH[1]}
		huge_size=${BASH_REMATCH[2]}

		printf "%-6s %10s %8s / %6s\n" $node $huge_size $free_pages $all_pages
	done
	shopt -u nullglob

	# fall back to system-wide hugepages
	if [ "$numa_nodes" = "0" ]; then
		free_pages=$(grep HugePages_Free /proc/meminfo | awk '{ print $2 }')
		all_pages=$(grep HugePages_Total /proc/meminfo | awk '{ print $2 }')
		node="-"
		huge_size="$HUGEPGSZ"

		printf "%-6s %10s %8s / %6s\n" $node $huge_size $free_pages $all_pages
	fi

	printf '\n%-8s %-15s %-6s %-6s %-7s %-16s %-10s %s\n' \
		"Type" "BDF" "Vendor" "Device" "NUMA" "Driver" "Device" "Block devices" >&2

	sorted_bdfs=($(printf '%s\n' "${!all_devices_d[@]}" | sort))

	for bdf in "${sorted_bdfs[@]}"; do
		driver=${drivers_d["$bdf"]}
		if [ "$numa_nodes" = "0" ]; then
			node="-"
		else
			node=$(cat /sys/bus/pci/devices/$bdf/numa_node)
			if ((node == -1)); then
				node=unknown
			fi
		fi
		if [ "$driver" = "nvme" ] && [ -d /sys/bus/pci/devices/$bdf/nvme ]; then
			name=$(ls /sys/bus/pci/devices/$bdf/nvme)
		else
			name="-"
		fi

		if [[ -n ${nvme_d["$bdf"]} || -n ${virtio_d["$bdf"]} ]]; then
			blknames=($(get_block_dev_from_bdf "$bdf"))
		else
			blknames=("-")
		fi

		desc=""
		desc=${desc:-${nvme_d["$bdf"]:+NVMe}}
		desc=${desc:-${ioat_d["$bdf"]:+I/OAT}}
		desc=${desc:-${dsa_d["$bdf"]:+DSA}}
		desc=${desc:-${iaa_d["$bdf"]:+IAA}}
		desc=${desc:-${virtio_d["$bdf"]:+virtio}}
		desc=${desc:-${vmd_d["$bdf"]:+VMD}}

		printf '%-8s %-15s %-6s %-6s %-7s %-16s %-10s %s\n' \
			"$desc" "$bdf" "${pci_ids_vendor["$bdf"]#0x}" "${pci_ids_device["$bdf"]#0x}" \
			"$node" "${driver:--}" "${name:-}" "${blknames[*]:--}"
	done
}

function status_freebsd() {
	local pci

	status_print() (
		local type=$1
		local dev driver

		shift

		for pci; do
			printf '%-8s %-15s %-6s %-6s %-16s\n' \
				"$type" \
				"$pci" \
				"${pci_ids_vendor["$pci"]}" \
				"${pci_ids_device["$pci"]}" \
				"${pci_bus_driver["$pci"]}"
		done | sort -k2,2
	)

	local contigmem=present
	local contigmem_buffer_size
	local contigmem_num_buffers

	if ! kldstat -q -m contigmem; then
		contigmem="not present"
	fi
	if ! contigmem_buffer_size=$(kenv hw.contigmem.buffer_size 2> /dev/null); then
		contigmem_buffer_size="not set"
	fi
	if ! contigmem_num_buffers=$(kenv hw.contigmem.num_buffers 2> /dev/null); then
		contigmem_num_buffers="not set"
	fi

	cat <<- BSD_INFO
		Contigmem ($contigmem)
		Buffer Size: $contigmem_buffer_size
		Num Buffers: $contigmem_num_buffers

	BSD_INFO

	printf '\n%-8s %-15s %-6s %-6s %-16s\n' \
		"Type" "BDF" "Vendor" "Device" "Driver" >&2

	status_print "NVMe" "${!nvme_d[@]}"
	status_print "I/OAT" "${!ioat_d[@]}"
	status_print "DSA" "${!dsa_d[@]}"
	status_print "IAA" "${!iaa_d[@]}"
	status_print "VMD" "${!vmd_d[@]}"
}

function configure_freebsd_pci() {
	local BDFS

	BDFS+=("${!nvme_d[@]}")
	BDFS+=("${!ioat_d[@]}")
	BDFS+=("${!dsa_d[@]}")
	BDFS+=("${!iaa_d[@]}")
	BDFS+=("${!vmd_d[@]}")

	# Drop the domain part from all the addresses
	BDFS=("${BDFS[@]#*:}")

	local IFS=","
	kldunload nic_uio.ko || true
	kenv hw.nic_uio.bdfs="${BDFS[*]}"
	kldload nic_uio.ko
}
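# nic_uio takes domain-less, comma-separated addresses, so a device list such as
# (illustrative) "0000:01:00.0 0000:02:00.0" becomes hw.nic_uio.bdfs="01:00.0,02:00.0".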

function configure_freebsd() {
	if ! check_for_driver_freebsd; then
		echo "DPDK drivers (contigmem and/or nic_uio) are missing, aborting" >&2
		return 1
	fi
	configure_freebsd_pci
	# If contigmem is already loaded but the HUGEMEM specified doesn't match the
	#  previous value, unload contigmem so that we can reload with the new value.
	if kldstat -q -m contigmem; then
		# contigmem may be loaded, but the kernel environment doesn't necessarily
		# have to be set at this point. If it isn't, kenv will fail to pick up
		# the hw. options. Handle it.
		if ! contigmem_num_buffers=$(kenv hw.contigmem.num_buffers); then
			contigmem_num_buffers=-1
		fi 2> /dev/null
		if ((contigmem_num_buffers != HUGEMEM / 256)); then
			kldunload contigmem.ko
		fi
	fi
	if ! kldstat -q -m contigmem; then
		kenv hw.contigmem.num_buffers=$((HUGEMEM / 256))
		kenv hw.contigmem.buffer_size=$((256 * 1024 * 1024))
		kldload contigmem.ko
	fi
}
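# HUGEMEM (in MB) is carved into fixed 256 MB contigmem buffers, e.g. the default
# HUGEMEM=2048 yields hw.contigmem.num_buffers=8.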

function reset_freebsd() {
	kldunload contigmem.ko || true
	kldunload nic_uio.ko || true
}

CMD=reset cache_pci_bus

mode=$1

if [ -z "$mode" ]; then
	mode="config"
fi

: ${HUGEMEM:=2048}
: ${PCI_ALLOWED:=""}
: ${PCI_BLOCKED:=""}

if [ -n "$NVME_ALLOWED" ]; then
	PCI_ALLOWED="$PCI_ALLOWED $NVME_ALLOWED"
fi

if [ -n "$SKIP_PCI" ]; then
	PCI_ALLOWED="none"
fi

if [ -z "$TARGET_USER" ]; then
	TARGET_USER="$SUDO_USER"
	if [ -z "$TARGET_USER" ]; then
		TARGET_USER=$(logname 2> /dev/null) || true
	fi
fi

collect_devices "$mode"

if [[ $mode == reset && $PCI_BLOCK_SYNC_ON_RESET == yes ]]; then
	# Note that this will wait only for the first block device attached to
	# a given storage controller. For nvme this may miss some of the devs
	# in case multiple namespaces are in place.
	# FIXME: Wait for nvme controller(s) to be in live state and determine
	# number of configured namespaces, build list of potential block devs
	# and pass them to sync_dev_uevents. Is it worth the effort?
	bdfs_to_wait_for=()
	for bdf in "${!all_devices_d[@]}"; do
		((all_devices_d["$bdf"] == 0)) || continue
		if [[ -n ${nvme_d["$bdf"]} || -n ${virtio_d["$bdf"]} ]]; then
			[[ $(collect_driver "$bdf") != "${drivers_d["$bdf"]}" ]] || continue
			bdfs_to_wait_for+=("$bdf")
		fi
	done
	if ((${#bdfs_to_wait_for[@]} > 0)); then
		echo "Waiting for block devices as requested"
		export UEVENT_TIMEOUT=5 DEVPATH_LOOKUP=yes DEVPATH_SUBSYSTEM=pci
		"$rootdir/scripts/sync_dev_uevents.sh" \
			block/disk \
			"${bdfs_to_wait_for[@]}" &
		sync_pid=$!
	fi
fi

if [[ $os == Linux ]]; then
	if [[ -n $HUGEPGSZ && ! -e /sys/kernel/mm/hugepages/hugepages-${HUGEPGSZ}kB ]]; then
		echo "${HUGEPGSZ}kB is not supported by the running kernel, ignoring" >&2
		unset -v HUGEPGSZ
	fi

	HUGEPGSZ=${HUGEPGSZ:-$(grep Hugepagesize /proc/meminfo | cut -d : -f 2 | tr -dc '0-9')}
	HUGEPGSZ_MB=$((HUGEPGSZ / 1024))
	: ${NRHUGE=$(((HUGEMEM + HUGEPGSZ_MB - 1) / HUGEPGSZ_MB))}
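	# NRHUGE defaults to HUGEMEM rounded up to whole hugepages, e.g. with the
	# defaults: 2048 MB / 2 MB pages -> 1024 hugepages.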

	if [ "$mode" == "config" ]; then
		configure_linux
	elif [ "$mode" == "cleanup" ]; then
		cleanup_linux
		clear_hugepages
	elif [ "$mode" == "reset" ]; then
		reset_linux
	elif [ "$mode" == "status" ]; then
		status_linux
	elif [ "$mode" == "help" ]; then
		usage $0
	else
		usage $0 "Invalid argument '$mode'"
	fi
else
	if [ "$mode" == "config" ]; then
		configure_freebsd
	elif [ "$mode" == "reset" ]; then
		reset_freebsd
	elif [ "$mode" == "cleanup" ]; then
		echo "setup.sh cleanup function not yet supported on $os"
	elif [ "$mode" == "status" ]; then
		status_freebsd
	elif [ "$mode" == "help" ]; then
		usage $0
	else
		usage $0 "Invalid argument '$mode'"
	fi
fi

if [[ -e /proc/$sync_pid/status ]]; then
	wait "$sync_pid"
fi