#!/usr/bin/env bash

set -e

os=$(uname -s)

if [[ $os != Linux && $os != FreeBSD ]]; then
	echo "Unsupported platform ($os), aborting"
	exit 1
fi

rootdir=$(readlink -f "$(dirname "$0")")/..
source "$rootdir/scripts/common.sh"

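# Example invocation (run as root): allocate 4 GB worth of hugepages and bind
# only the single allowed NVMe controller:
#   HUGEMEM=4096 PCI_ALLOWED="0000:01:00.0" ./setup.sh config
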
function usage() {
	if [[ $os == Linux ]]; then
		options="[config|reset|status|cleanup|help]"
	else
		options="[config|reset|help]"
	fi

	[[ -n $2 ]] && (
		echo "$2"
		echo ""
	)
	echo "Helper script for allocating hugepages and binding NVMe, I/OAT, VMD and Virtio devices"
	echo "to a generic VFIO kernel driver. If VFIO is not available on the system, this script"
	echo "will fall back to UIO. NVMe and Virtio devices with active mountpoints will be ignored."
	echo "All hugepage operations use the default hugepage size on the system (hugepagesz)."
	echo "Usage: $(basename "$1") $options"
	echo
	echo "$options - as follows:"
	echo "config            Default mode. Allocate hugepages and bind PCI devices."
	if [[ $os == Linux ]]; then
		echo "cleanup           Remove any orphaned files that may be left in the system after SPDK application exit"
	fi
	echo "reset             Rebind PCI devices back to their original drivers."
	echo "                  Also clean up any leftover SPDK files/resources."
	echo "                  Hugepage memory size will remain unchanged."
	if [[ $os == Linux ]]; then
		echo "status            Print status of all SPDK-compatible devices on the system."
	fi
	echo "help              Print this help message."
	echo
	echo "The following environment variables can be specified:"
	echo "HUGEMEM           Size of hugepage memory to allocate (in MB). 2048 by default."
	echo "                  On NUMA systems, the hugepages will be allocated on node0 by"
	echo "                  default."
	echo "HUGE_EVEN_ALLOC   If set to 'yes', hugepages will be evenly distributed across all"
	echo "                  of the system's NUMA nodes (effectively ignoring anything set in HUGENODE)."
	echo "                  Uses the kernel's default hugepage size."
	echo "NRHUGE            Number of hugepages to allocate. This variable overrides HUGEMEM."
	echo "HUGENODE          Specific NUMA node to allocate hugepages on. Multiple nodes can be"
	echo "                  separated with commas. By default, NRHUGE will be applied on each node."
	echo "                  Hugepages can be defined per node, e.g.:"
	echo "                  HUGENODE='nodes_hp[0]=2048,nodes_hp[1]=512,2' - this will allocate"
	echo "                  2048 pages for node0, 512 for node1 and the default NRHUGE for node2."
	echo "HUGEPGSZ          Size of the hugepages to use in kB. If not set, the kernel's default"
	echo "                  setting is used."
	echo "CLEAR_HUGE        If set to 'yes', an attempt to remove hugepages from all nodes will"
	echo "                  be made prior to allocation."
	echo "PCI_ALLOWED"
	echo "PCI_BLOCKED       Whitespace-separated list of PCI devices (NVMe, I/OAT, VMD, Virtio)."
	echo "                  Each device must be specified as a full PCI address."
	echo "                  E.g. PCI_ALLOWED=\"0000:01:00.0 0000:02:00.0\""
	echo "                  To block all PCI devices: PCI_ALLOWED=\"none\""
	echo "                  To allow all PCI devices except 0000:01:00.0: PCI_BLOCKED=\"0000:01:00.0\""
	echo "                  To allow only PCI device 0000:01:00.0: PCI_ALLOWED=\"0000:01:00.0\""
	echo "                  If PCI_ALLOWED and PCI_BLOCKED are empty or unset, all PCI devices"
	echo "                  will be bound."
	echo "                  Each device in PCI_BLOCKED will be ignored (its driver won't be changed)."
	echo "                  PCI_BLOCKED has precedence over PCI_ALLOWED."
	echo "TARGET_USER       User that will own the hugepage mountpoint directory and vfio groups."
	echo "                  By default the current user will be used."
	echo "DRIVER_OVERRIDE   Disable automatic vfio-pci/uio_pci_generic selection and forcefully"
	echo "                  bind devices to the given driver."
	echo "                  E.g. DRIVER_OVERRIDE=uio_pci_generic or DRIVER_OVERRIDE=/home/public/dpdk/build/kmod/igb_uio.ko"
	echo "PCI_BLOCK_SYNC_ON_RESET"
	echo "                  If set in the environment, an attempt to wait for block devices associated"
	echo "                  with a given PCI device will be made upon reset."
	exit 0
}

# In monolithic kernels lsmod won't work, so fall back to /sys/module. We also
# check /sys/bus/pci/drivers/ since neither lsmod nor /sys/module may contain
# the needed info (e.g. on Fedora-like distros).
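# Return values: 1 if the driver shows up in lsmod, 2 if it's present under
# /sys/module or /sys/bus/pci/drivers, 0 if not found. Callers therefore negate
# the result, e.g. `! check_for_driver igb_uio` means igb_uio is available.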
function check_for_driver() {
	if lsmod | grep -q "${1//-/_}"; then
		return 1
	fi

	if [[ -d /sys/module/${1} || -d \
		/sys/module/${1//-/_} || -d \
		/sys/bus/pci/drivers/${1} || -d \
		/sys/bus/pci/drivers/${1//-/_} ]]; then
		return 2
	fi
	return 0
}

function check_for_driver_freebsd() {
	# Check if the DPDK drivers (nic_uio, contigmem) are in the kernel's module path.
	local search_paths path driver
	IFS=";" read -ra search_paths < <(kldconfig -rU)

	for driver in contigmem.ko nic_uio.ko; do
		for path in "${search_paths[@]}"; do
			[[ -f $path/$driver ]] && continue 2
		done
		return 1
	done
	return 0
}

function pci_dev_echo() {
	local bdf="$1"
	shift
	echo "$bdf (${pci_ids_vendor["$bdf"]#0x} ${pci_ids_device["$bdf"]#0x}): $*"
}

function linux_bind_driver() {
	bdf="$1"
	driver_name="$2"
	old_driver_name=${drivers_d["$bdf"]:-no driver}
	ven_dev_id="${pci_ids_vendor["$bdf"]#0x} ${pci_ids_device["$bdf"]#0x}"

	if [[ $driver_name == "$old_driver_name" ]]; then
		pci_dev_echo "$bdf" "Already using the $old_driver_name driver"
		return 0
	fi

	if [[ $old_driver_name != "no driver" ]]; then
		echo "$ven_dev_id" > "/sys/bus/pci/devices/$bdf/driver/remove_id" 2> /dev/null || true
		echo "$bdf" > "/sys/bus/pci/devices/$bdf/driver/unbind"
	fi

	pci_dev_echo "$bdf" "$old_driver_name -> $driver_name"

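	# Registering the vendor:device id with new_id typically makes the driver
	# bind automatically; the explicit write to bind below covers kernels where
	# that doesn't happen. Failures are ignored since either path may suffice.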
	echo "$ven_dev_id" > "/sys/bus/pci/drivers/$driver_name/new_id" 2> /dev/null || true
	echo "$bdf" > "/sys/bus/pci/drivers/$driver_name/bind" 2> /dev/null || true

	if [[ $driver_name == uio_pci_generic ]] && ! check_for_driver igb_uio; then
		# Check if the uio_pci_generic driver is broken as it might be in
		# some 4.18.x kernels (see centos8 for instance) - if our device
		# didn't get a proper uio entry, fall back to igb_uio
		if [[ ! -e /sys/bus/pci/devices/$bdf/uio ]]; then
			pci_dev_echo "$bdf" "uio_pci_generic potentially broken, moving to igb_uio"
			drivers_d["$bdf"]="no driver"
			# This call will override $driver_name for the remaining devices as well
			linux_bind_driver "$bdf" igb_uio
		fi
	fi

	iommu_group=$(basename "$(readlink -f "/sys/bus/pci/devices/$bdf/iommu_group")")
	if [ -e "/dev/vfio/$iommu_group" ]; then
		if [ -n "$TARGET_USER" ]; then
			chown "$TARGET_USER" "/dev/vfio/$iommu_group"
		fi
	fi
}

function linux_unbind_driver() {
	local bdf="$1"
	local ven_dev_id
	ven_dev_id="${pci_ids_vendor["$bdf"]#0x} ${pci_ids_device["$bdf"]#0x}"
	local old_driver_name=${drivers_d["$bdf"]:-no driver}

	if [[ -e /sys/bus/pci/drivers/$old_driver_name ]]; then
		echo "$ven_dev_id" > "/sys/bus/pci/drivers/$old_driver_name/remove_id" 2> /dev/null || true
		echo "$bdf" > "/sys/bus/pci/drivers/$old_driver_name/unbind"
	fi

	pci_dev_echo "$bdf" "$old_driver_name -> no driver"
}

function linux_hugetlbfs_mounts() {
	mount | grep ' type hugetlbfs ' | awk '{ print $3 }'
}

function get_block_dev_from_bdf() {
	local bdf=$1
	local block

	for block in /sys/block/*; do
		if [[ $(readlink -f "$block/device") == *"/$bdf/"* ]]; then
			echo "${block##*/}"
		fi
	done
}

function get_mounted_part_dev_from_bdf_block() {
	local bdf=$1
	local blocks block dev mount

	hash lsblk || return 1
	blocks=($(get_block_dev_from_bdf "$bdf"))

	for block in "${blocks[@]}"; do
		while read -r dev mount; do
			if [[ -e $mount ]]; then
				echo "$block:$dev"
			fi
		done < <(lsblk -l -n -o NAME,MOUNTPOINT "/dev/$block")
	done
}

function collect_devices() {
	# NVMe, IOAT, IDXD, VIRTIO, VMD

	local ids dev_type dev_id bdf bdfs in_use driver

	ids+="PCI_DEVICE_ID_INTEL_IOAT"
	ids+="|PCI_DEVICE_ID_INTEL_IDXD"
	ids+="|PCI_DEVICE_ID_VIRTIO"
	ids+="|PCI_DEVICE_ID_INTEL_VMD"
	ids+="|SPDK_PCI_CLASS_NVME"

	local -gA nvme_d ioat_d idxd_d virtio_d vmd_d all_devices_d drivers_d

	while read -r _ dev_type dev_id; do
		bdfs=(${pci_bus_cache["0x8086:$dev_id"]})
		[[ $dev_type == *NVME* ]] && bdfs=(${pci_bus_cache["$dev_id"]})
		[[ $dev_type == *VIRT* ]] && bdfs=(${pci_bus_cache["0x1af4:$dev_id"]})
		[[ $dev_type =~ (NVME|IOAT|IDXD|VIRTIO|VMD) ]] && dev_type=${BASH_REMATCH[1],,}
		for bdf in "${bdfs[@]}"; do
			in_use=0
			if [[ $1 != status ]]; then
				if ! pci_can_use "$bdf"; then
					pci_dev_echo "$bdf" "Skipping denied controller at $bdf"
					in_use=1
				fi
				if [[ $dev_type == nvme || $dev_type == virtio ]]; then
					if ! verify_bdf_mounts "$bdf"; then
						in_use=1
					fi
				fi
				if [[ $dev_type == vmd ]]; then
					if [[ $PCI_ALLOWED != *"$bdf"* ]]; then
						pci_dev_echo "$bdf" "Skipping disallowed VMD controller at $bdf"
						in_use=1
					fi
				fi
			fi
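			# Record the device in its per-type map (e.g. nvme_d["0000:01:00.0"]=0)
			# and in the global map; 0 means the device is free to be bound.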
			eval "${dev_type}_d[$bdf]=$in_use"
			all_devices_d["$bdf"]=$in_use
			if [[ -e /sys/bus/pci/devices/$bdf/driver ]]; then
				driver=$(readlink -f "/sys/bus/pci/devices/$bdf/driver")
				drivers_d["$bdf"]=${driver##*/}
			fi
		done
	done < <(grep -E "$ids" "$rootdir/include/spdk/pci_ids.h")
}

function collect_driver() {
	local bdf=$1
	local drivers driver

	[[ -e /sys/bus/pci/devices/$bdf/modalias ]] || return 1
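	# modprobe -R resolves the device's modalias string to the name(s) of the
	# kernel module(s) that claim it, without actually loading anything.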
	if drivers=($(modprobe -R "$(< "/sys/bus/pci/devices/$bdf/modalias")")); then
		# Pick the first entry in case the alias resolves to multiple modules.
		driver=$(readlink -f "/sys/module/${drivers[0]}/drivers/pci:"*)
		driver=${driver##*/}
	else
		[[ -n ${nvme_d["$bdf"]} ]] && driver=nvme
		[[ -n ${ioat_d["$bdf"]} ]] && driver=ioatdma
		[[ -n ${idxd_d["$bdf"]} ]] && driver=idxd
		[[ -n ${virtio_d["$bdf"]} ]] && driver=virtio-pci
		[[ -n ${vmd_d["$bdf"]} ]] && driver=vmd
	fi 2> /dev/null
	echo "$driver"
}

function verify_bdf_mounts() {
	local bdf=$1
	local blknames
	blknames=($(get_mounted_part_dev_from_bdf_block "$bdf")) || return 1

	if ((${#blknames[@]} > 0)); then
		local IFS=","
		pci_dev_echo "$bdf" "Active mountpoints on ${blknames[*]}, so not binding PCI dev"
		return 1
	fi
}

function configure_linux_pci() {
	local driver_path=""
	driver_name=""
	igb_uio_fallback=""

	if [[ -r "$rootdir/dpdk/build-tmp/kernel/linux/igb_uio/igb_uio.ko" ]]; then
		# igb_uio is a common driver to override with and it depends on uio.
		modprobe uio || true
		if ! check_for_driver igb_uio || insmod "$rootdir/dpdk/build-tmp/kernel/linux/igb_uio/igb_uio.ko"; then
			igb_uio_fallback="$rootdir/dpdk/build-tmp/kernel/linux/igb_uio/igb_uio.ko"
		fi
	fi

	if [[ -n "${DRIVER_OVERRIDE}" ]]; then
		driver_path="$DRIVER_OVERRIDE"
		driver_name="${DRIVER_OVERRIDE##*/}"
		# modprobe and the sysfs don't use the .ko suffix.
		driver_name=${driver_name%.ko}
		# path = name -> there is no path
		if [[ "$driver_path" = "$driver_name" ]]; then
			driver_path=""
		fi
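	# Prefer vfio-pci whenever an IOMMU is active (populated
	# /sys/kernel/iommu_groups) or vfio's unsafe no-IOMMU mode is enabled.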
	elif [[ -n "$(ls /sys/kernel/iommu_groups)" || (-e \
	/sys/module/vfio/parameters/enable_unsafe_noiommu_mode && \
	"$(cat /sys/module/vfio/parameters/enable_unsafe_noiommu_mode)" == "Y") ]]; then
		driver_name=vfio-pci
		# Just in case, attempt to load the VFIO_IOMMU_TYPE1 module into the kernel - this
		# should be done automatically by modprobe since this particular module should
		# be a part of vfio-pci dependencies, however, on some distros, it seems that
		# it's not the case. See #1689.
		if modinfo vfio_iommu_type1 > /dev/null; then
			modprobe vfio_iommu_type1
		fi
	elif ! check_for_driver uio_pci_generic || modinfo uio_pci_generic > /dev/null 2>&1; then
		driver_name=uio_pci_generic
	elif [[ -e $igb_uio_fallback ]]; then
		driver_path="$igb_uio_fallback"
		driver_name="igb_uio"
		echo "WARNING: uio_pci_generic not detected - using $driver_name"
	else
		echo "No valid drivers found [vfio-pci, uio_pci_generic, igb_uio]. Please enable one of the kernel modules."
		return 1
	fi

	# modprobe looks the module up in the kernel's module directory, so if the
	# user passed in a path, load it with insmod instead.
	if [[ -n "$driver_path" ]]; then
		insmod "$driver_path" || true
	else
		modprobe "$driver_name"
	fi

	for bdf in "${!all_devices_d[@]}"; do
		if ((all_devices_d["$bdf"] == 0)); then
			if [[ -n ${nvme_d["$bdf"]} ]]; then
				# Some nvme controllers may take a significant amount of time while being
				# unbound from the driver. Put that task into the background to speed up the
				# whole process. Currently this is done only for the devices bound to the
				# nvme driver as others, e.g. ioatdma, trigger a kernel BUG when being
				# unbound in parallel. See https://bugzilla.kernel.org/show_bug.cgi?id=209041.
				linux_bind_driver "$bdf" "$driver_name" &
			else
				linux_bind_driver "$bdf" "$driver_name"
			fi
		fi
	done
	wait

	echo "1" > "/sys/bus/pci/rescan"
}

function cleanup_linux() {
	shopt -s extglob nullglob
	dirs_to_clean="$(echo {/var/run,/tmp}/dpdk/spdk{,_pid}+([0-9])) "
	if [[ -d $XDG_RUNTIME_DIR && $XDG_RUNTIME_DIR != *" "* ]]; then
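		# `assert_not_empty` is a deliberately non-existent path: it guarantees
		# readlink -e always has at least one operand (nullglob may drop all the
		# globs), and the resulting non-zero exit is swallowed by `|| true`.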
		dirs_to_clean+="$(readlink -e assert_not_empty $XDG_RUNTIME_DIR/dpdk/spdk{,_pid}+([0-9]) || true) "
	fi

	files_to_clean="" file_locks=()
	for dir in $dirs_to_clean; do
		files_to_clean+="$(echo $dir/*) "
	done
	file_locks+=(/var/tmp/spdk_pci_lock*)
	shopt -u extglob nullglob

	files_to_clean+="$(ls -1 /dev/shm/* \
		| grep -E '(spdk_tgt|iscsi|vhost|nvmf|rocksdb|bdevio|bdevperf|vhost_fuzz|nvme_fuzz)_trace|spdk_iscsi_conns' || true) "
	files_to_clean+=" ${file_locks[*]}"
	files_to_clean="$(readlink -e assert_not_empty $files_to_clean || true)"
	if [[ -z "$files_to_clean" ]]; then
		echo "Clean"
		return 0
	fi

	shopt -s extglob
	for fd_dir in $(echo /proc/+([0-9])); do
		opened_files+="$(readlink -e assert_not_empty $fd_dir/fd/* || true)"
	done
	shopt -u extglob

	if [[ -z "$opened_files" ]]; then
		echo "Can't get list of opened files!"
		exit 1
	fi

	echo 'Cleaning'
	for f in $files_to_clean; do
		if ! echo "$opened_files" | grep -E -q "^$f\$"; then
			echo "Removing:    $f"
			rm $f
		else
			echo "Still open: $f"
		fi
	done

	for dir in $dirs_to_clean; do
		if ! echo "$opened_files" | grep -E -q "^$dir\$"; then
			echo "Removing:    $dir"
			rmdir $dir
		else
			echo "Still open: $dir"
		fi
	done
	echo "Clean"

	unset dirs_to_clean files_to_clean opened_files
}

check_hugepages_alloc() {
	local hp_int=$1
	local allocated_hugepages

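	# Writing to an nr_hugepages interface asks the kernel to reserve that many
	# pages; reading it back reveals how many it actually managed to allocate.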
	echo $((NRHUGE < 0 ? 0 : NRHUGE)) > "$hp_int"

	allocated_hugepages=$(< "$hp_int")
	if ((allocated_hugepages < NRHUGE)); then
		cat <<- ERROR

			## ERROR: requested $NRHUGE hugepages but only $allocated_hugepages could be allocated ${2:+on node$2}.
			## Memory might be heavily fragmented. Please try flushing the system cache, or reboot the machine.
		ERROR
		return 1
	fi
}

clear_hugepages() { echo 0 > /proc/sys/vm/nr_hugepages; }

configure_linux_hugepages() {
	local node
	local nodes=() nodes_to_use nodes_hp

	if [[ $CLEAR_HUGE == yes ]]; then
		clear_hugepages
	fi

	if [[ $HUGE_EVEN_ALLOC == yes ]]; then
		clear_hugepages
		check_hugepages_alloc /proc/sys/vm/nr_hugepages
		return 0
	fi

	for node in /sys/devices/system/node/node*; do
		[[ -e $node ]] || continue
		nodes[${node##*node}]=$node/hugepages/hugepages-${HUGEPGSZ}kB/nr_hugepages
	done

	if ((${#nodes[@]} == 0)); then
		# No NUMA support? Fall back to the common interface.
		check_hugepages_alloc /proc/sys/vm/nr_hugepages
		return 0
	fi

	IFS="," read -ra nodes_to_use <<< "$HUGENODE"
	if ((${#nodes_to_use[@]} == 0)); then
		nodes_to_use[0]=0
	fi

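	# HUGENODE accepts plain node ids or per-node overrides, e.g.
	# HUGENODE='nodes_hp[0]=2048,nodes_hp[1]=512,2' allocates 2048 pages on
	# node0, 512 on node1 and the default NRHUGE on node2.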
	# Align indexes with node ids
	for node in "${!nodes_to_use[@]}"; do
		if [[ ${nodes_to_use[node]} =~ ^nodes_hp\[[0-9]+\]= ]]; then
			eval "${nodes_to_use[node]}"
		elif [[ ${nodes_to_use[node]} =~ ^[0-9]+$ ]]; then
			nodes_hp[nodes_to_use[node]]=$NRHUGE
		fi
	done

	for node in "${!nodes_hp[@]}"; do
		if [[ -z ${nodes[node]} ]]; then
			echo "Node $node doesn't exist, ignoring" >&2
			continue
		fi
		NRHUGE=${nodes_hp[node]:-$NRHUGE} check_hugepages_alloc "${nodes[node]}" "$node"
	done
}

function configure_linux() {
	configure_linux_pci
	hugetlbfs_mounts=$(linux_hugetlbfs_mounts)

	if [ -z "$hugetlbfs_mounts" ]; then
		hugetlbfs_mounts=/mnt/huge
		echo "Mounting hugetlbfs at $hugetlbfs_mounts"
		mkdir -p "$hugetlbfs_mounts"
		mount -t hugetlbfs nodev "$hugetlbfs_mounts"
	fi

	configure_linux_hugepages

	if [ "$driver_name" = "vfio-pci" ]; then
		if [ -n "$TARGET_USER" ]; then
			for mount in $hugetlbfs_mounts; do
				chown "$TARGET_USER" "$mount"
				chmod g+w "$mount"
			done

			MEMLOCK_AMNT=$(su "$TARGET_USER" -c "ulimit -l")
			if [[ $MEMLOCK_AMNT != "unlimited" ]]; then
				MEMLOCK_MB=$((MEMLOCK_AMNT / 1024))
				cat <<- MEMLOCK
					"$TARGET_USER" user memlock limit: $MEMLOCK_MB MB

					This is the maximum amount of memory you will be
					able to use with DPDK and VFIO if run as user "$TARGET_USER".
					To change this, please adjust limits.conf memlock limit for user "$TARGET_USER".
				MEMLOCK
				if ((MEMLOCK_AMNT < 65536)); then
					echo ""
					echo "## WARNING: memlock limit is less than 64MB"
					echo -n "## DPDK with VFIO may not be able to initialize "
					echo "if run as user \"$TARGET_USER\"."
				fi
			fi
		fi
	fi

	if [ ! -e /dev/cpu/0/msr ]; then
		# Some distros build msr as a module.  Make sure it's loaded to ensure
		#  DPDK can easily figure out the TSC rate rather than relying on 100ms
		#  sleeps.
		modprobe msr || true
	fi
}

function reset_linux_pci() {
	# virtio
	# TODO: check if virtio-pci is loaded first and just unbind if it is not loaded
	# Requires some more investigation - for example, some kernels do not seem to have
	#  virtio-pci but just virtio_scsi instead.  Also need to make sure we get the
	#  underscore vs. dash right in the virtio_scsi name.
	modprobe virtio-pci || true
	for bdf in "${!all_devices_d[@]}"; do
		((all_devices_d["$bdf"] == 0)) || continue

		driver=$(collect_driver "$bdf")
		if ! check_for_driver "$driver"; then
			linux_bind_driver "$bdf" "$driver"
		else
			linux_unbind_driver "$bdf"
		fi
	done

	echo "1" > "/sys/bus/pci/rescan"
}

function reset_linux() {
	reset_linux_pci
	for mount in $(linux_hugetlbfs_mounts); do
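		# Remove SPDK's hugepage map files, but only those that no live process
		# still holds a lock on (flock -n fails when the file is locked).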
		for hp in "$mount"/spdk*map_*; do
			flock -n "$hp" true && rm -f "$hp"
		done
	done
	rm -f /run/.spdk*
}

function status_linux() {
	echo "Hugepages" >&2
	printf "%-6s %10s %8s / %6s\n" "node" "hugesize" "free" "total" >&2

	numa_nodes=0
	shopt -s nullglob
	for path in /sys/devices/system/node/node*/hugepages/hugepages-*/; do
		numa_nodes=$((numa_nodes + 1))
		free_pages=$(cat "$path/free_hugepages")
		all_pages=$(cat "$path/nr_hugepages")

		[[ $path =~ (node[0-9]+)/hugepages/hugepages-([0-9]+kB) ]]

		node=${BASH_REMATCH[1]}
		huge_size=${BASH_REMATCH[2]}

		printf "%-6s %10s %8s / %6s\n" "$node" "$huge_size" "$free_pages" "$all_pages"
	done
	shopt -u nullglob

	# fall back to system-wide hugepages
	if [ "$numa_nodes" = "0" ]; then
		free_pages=$(grep HugePages_Free /proc/meminfo | awk '{ print $2 }')
		all_pages=$(grep HugePages_Total /proc/meminfo | awk '{ print $2 }')
		node="-"
		huge_size="$HUGEPGSZ"

		printf "%-6s %10s %8s / %6s\n" "$node" "$huge_size" "$free_pages" "$all_pages"
	fi

	printf '\n%-8s %-15s %-6s %-6s %-7s %-16s %-10s %s\n' \
		"Type" "BDF" "Vendor" "Device" "NUMA" "Driver" "Dev name" "Block devices" >&2

	sorted_bdfs=($(printf '%s\n' "${!all_devices_d[@]}" | sort))

	for bdf in "${sorted_bdfs[@]}"; do
		driver=${drivers_d["$bdf"]}
		if [ "$numa_nodes" = "0" ]; then
			node="-"
		else
			node=$(cat /sys/bus/pci/devices/$bdf/numa_node)
			if ((node == -1)); then
				node=unknown
			fi
		fi
		if [ "$driver" = "nvme" ] && [ -d /sys/bus/pci/devices/$bdf/nvme ]; then
			name=$(ls /sys/bus/pci/devices/$bdf/nvme)
		else
			name="-"
		fi

		if [[ -n ${nvme_d["$bdf"]} || -n ${virtio_d["$bdf"]} ]]; then
			blknames=($(get_block_dev_from_bdf "$bdf"))
		else
			blknames=("-")
		fi

		desc=""
		desc=${desc:-${nvme_d["$bdf"]:+NVMe}}
		desc=${desc:-${ioat_d["$bdf"]:+I/OAT}}
		desc=${desc:-${idxd_d["$bdf"]:+IDXD}}
		desc=${desc:-${virtio_d["$bdf"]:+virtio}}
		desc=${desc:-${vmd_d["$bdf"]:+VMD}}

		printf '%-8s %-15s %-6s %-6s %-7s %-16s %-10s %s\n' \
			"$desc" "$bdf" "${pci_ids_vendor["$bdf"]#0x}" "${pci_ids_device["$bdf"]#0x}" \
			"$node" "${driver:--}" "${name:-}" "${blknames[*]:--}"
	done
}

function status_freebsd() {
	local pci

	status_print() (
		local dev driver

		echo -e "BDF\t\tVendor\tDevice\tDriver"

		for pci; do
			driver=$(pciconf -l "pci$pci")
			driver=${driver%@*}
			printf '%s\t%s\t%s\t%s\n' \
				"$pci" \
				"${pci_ids_vendor["$pci"]}" \
				"${pci_ids_device["$pci"]}" \
				"$driver"
		done | sort -k1,1
	)

	local contigmem=present
	local contigmem_buffer_size
	local contigmem_num_buffers

	if ! kldstat -q -m contigmem; then
		contigmem="not present"
	fi
	if ! contigmem_buffer_size=$(kenv hw.contigmem.buffer_size 2> /dev/null); then
		contigmem_buffer_size="not set"
	fi
	if ! contigmem_num_buffers=$(kenv hw.contigmem.num_buffers 2> /dev/null); then
		contigmem_num_buffers="not set"
	fi

	cat <<- BSD_INFO
		Contigmem ($contigmem)
		Buffer Size: $contigmem_buffer_size
		Num Buffers: $contigmem_num_buffers

		NVMe devices
		$(status_print "${!nvme_d[@]}")

		I/OAT DMA
		$(status_print "${!ioat_d[@]}")

		IDXD DMA
		$(status_print "${!idxd_d[@]}")

		VMD
		$(status_print "${!vmd_d[@]}")
	BSD_INFO
}

function configure_freebsd_pci() {
	local BDFS

	BDFS+=("${!nvme_d[@]}")
	BDFS+=("${!ioat_d[@]}")
	BDFS+=("${!idxd_d[@]}")
	BDFS+=("${!vmd_d[@]}")

	# Drop the domain part from all the addresses
	BDFS=("${BDFS[@]#*:}")

	local IFS=","
	kldunload nic_uio.ko || true
	kenv hw.nic_uio.bdfs="${BDFS[*]}"
	kldload nic_uio.ko
}

function configure_freebsd() {
	if ! check_for_driver_freebsd; then
		echo "DPDK drivers (contigmem and/or nic_uio) are missing, aborting" >&2
		return 1
	fi
	configure_freebsd_pci
	# If contigmem is already loaded but the HUGEMEM specified doesn't match the
	#  previous value, unload contigmem so that we can reload with the new value.
	if kldstat -q -m contigmem; then
		# contigmem may be loaded, but the kernel environment doesn't
		# necessarily have to be set at this point. If it isn't, kenv will
		# fail to pick up the hw. options. Handle it.
		if ! contigmem_num_buffers=$(kenv hw.contigmem.num_buffers); then
			contigmem_num_buffers=-1
		fi 2> /dev/null
		if ((contigmem_num_buffers != HUGEMEM / 256)); then
			kldunload contigmem.ko
		fi
	fi
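	# contigmem carves HUGEMEM into fixed 256 MB buffers, so e.g. HUGEMEM=2048
	# yields 8 buffers of 256 MB each.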
	if ! kldstat -q -m contigmem; then
		kenv hw.contigmem.num_buffers=$((HUGEMEM / 256))
		kenv hw.contigmem.buffer_size=$((256 * 1024 * 1024))
		kldload contigmem.ko
	fi
}

function reset_freebsd() {
	kldunload contigmem.ko || true
	kldunload nic_uio.ko || true
}

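# cache_pci_bus is provided by scripts/common.sh; presumably it populates the
# pci_bus_cache and pci_ids_vendor/pci_ids_device maps consulted above.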
CMD=reset cache_pci_bus

mode=$1

if [ -z "$mode" ]; then
	mode="config"
fi

: ${HUGEMEM:=2048}
: ${PCI_ALLOWED:=""}
: ${PCI_BLOCKED:=""}

if [ -n "$NVME_ALLOWED" ]; then
	PCI_ALLOWED="$PCI_ALLOWED $NVME_ALLOWED"
fi

if [ -n "$SKIP_PCI" ]; then
	PCI_ALLOWED="none"
fi

if [ -z "$TARGET_USER" ]; then
	TARGET_USER="$SUDO_USER"
	if [ -z "$TARGET_USER" ]; then
		TARGET_USER=$(logname 2> /dev/null) || true
	fi
fi

collect_devices "$mode"

if [[ $mode == reset && $PCI_BLOCK_SYNC_ON_RESET == yes ]]; then
	# Note that this will wait only for the first block device attached to
	# a given storage controller. For nvme this may miss some of the devs
	# in case multiple namespaces are in place.
	# FIXME: Wait for nvme controller(s) to be in live state and determine
	# the number of configured namespaces, build a list of potential block devs
	# and pass them to sync_dev_uevents. Is it worth the effort?
	bdfs_to_wait_for=()
	for bdf in "${!all_devices_d[@]}"; do
		((all_devices_d["$bdf"] == 0)) || continue
		if [[ -n ${nvme_d["$bdf"]} || -n ${virtio_d["$bdf"]} ]]; then
			[[ $(collect_driver "$bdf") != "${drivers_d["$bdf"]}" ]] || continue
			bdfs_to_wait_for+=("$bdf")
		fi
	done
	if ((${#bdfs_to_wait_for[@]} > 0)); then
		echo "Waiting for block devices as requested"
		export UEVENT_TIMEOUT=5 DEVPATH_LOOKUP=yes DEVPATH_SUBSYSTEM=pci
		"$rootdir/scripts/sync_dev_uevents.sh" \
			block/disk \
			"${bdfs_to_wait_for[@]}" &
		sync_pid=$!
	fi
fi

if [[ $os == Linux ]]; then
	if [[ -n $HUGEPGSZ && ! -e /sys/kernel/mm/hugepages/hugepages-${HUGEPGSZ}kB ]]; then
		echo "${HUGEPGSZ}kB is not supported by the running kernel, ignoring" >&2
		unset -v HUGEPGSZ
	fi

	HUGEPGSZ=${HUGEPGSZ:-$(grep Hugepagesize /proc/meminfo | cut -d : -f 2 | tr -dc '0-9')}
	HUGEPGSZ_MB=$((HUGEPGSZ / 1024))
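	# Ceiling division: e.g. HUGEMEM=2048 MB with 2048 kB (2 MB) pages gives
	# NRHUGE=1024. NRHUGE already set in the environment takes precedence.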
	: ${NRHUGE=$(((HUGEMEM + HUGEPGSZ_MB - 1) / HUGEPGSZ_MB))}

	if [ "$mode" == "config" ]; then
		configure_linux
	elif [ "$mode" == "cleanup" ]; then
		cleanup_linux
		clear_hugepages
	elif [ "$mode" == "reset" ]; then
		reset_linux
	elif [ "$mode" == "status" ]; then
		status_linux
	elif [ "$mode" == "help" ]; then
		usage "$0"
	else
		usage "$0" "Invalid argument '$mode'"
	fi
else
	if [ "$mode" == "config" ]; then
		configure_freebsd
	elif [ "$mode" == "reset" ]; then
		reset_freebsd
	elif [ "$mode" == "cleanup" ]; then
		echo "setup.sh cleanup function not yet supported on $os"
	elif [ "$mode" == "status" ]; then
		status_freebsd
	elif [ "$mode" == "help" ]; then
		usage "$0"
	else
		usage "$0" "Invalid argument '$mode'"
	fi
fi

if [[ -e /proc/$sync_pid/status ]]; then
	wait "$sync_pid"
fi