1#!/usr/bin/env bash 2# SPDX-License-Identifier: BSD-3-Clause 3# Copyright (C) 2016 Intel Corporation 4# All rights reserved. 5# 6set -e 7shopt -s nullglob extglob 8 9os=$(uname -s) 10 11if [[ $os != Linux && $os != FreeBSD ]]; then 12 echo "Not supported platform ($os), aborting" 13 exit 1 14fi 15 16rootdir=$(readlink -f $(dirname $0))/.. 17source "$rootdir/scripts/common.sh" 18 19function usage() { 20 if [[ $os == Linux ]]; then 21 options="[config|reset|status|cleanup|interactive|help]" 22 else 23 options="[config|reset|interactive|help]" 24 fi 25 26 [[ -n $2 ]] && ( 27 echo "$2" 28 echo "" 29 ) 30 echo "Helper script for allocating hugepages and binding NVMe, I/OAT, VMD and Virtio devices" 31 echo "to a generic VFIO kernel driver. If VFIO is not available on the system, this script" 32 echo "will fall back to UIO. NVMe and Virtio devices with active mountpoints will be ignored." 33 echo "All hugepage operations use default hugepage size on the system (hugepagesz)." 34 echo "Usage: $(basename $1) $options" 35 echo 36 echo "$options - as following:" 37 echo "config Default mode. Allocate hugepages and bind PCI devices." 38 if [[ $os == Linux ]]; then 39 echo "cleanup Remove any orphaned files that can be left in the system after SPDK application exit" 40 fi 41 echo "reset Rebind PCI devices back to their original drivers." 42 echo " Also cleanup any leftover spdk files/resources." 43 echo " Hugepage memory size will remain unchanged." 44 if [[ $os == Linux ]]; then 45 echo "status Print status of all SPDK-compatible devices on the system." 46 fi 47 echo "interactive Executes script in interactive mode." 48 echo "help Print this help message." 49 echo 50 echo "The following environment variables can be specified." 51 echo "HUGEMEM Size of hugepage memory to allocate (in MB). 2048 by default." 52 echo " For NUMA systems, the hugepages will be distributed on node0 by" 53 echo " default." 54 echo "NRHUGE Number of hugepages to allocate. This variable overwrites HUGEMEM." 55 echo "HUGENODE Specific NUMA node to allocate hugepages on. Multiple nodes can be" 56 echo " separated with comas. By default, NRHUGE will be applied on each node." 57 echo " Hugepages can be defined per node with e.g.:" 58 echo " HUGENODE='nodes_hp[0]=2048,nodes_hp[1]=512,2' - this will allocate" 59 echo " 2048 pages for node0, 512 for node1 and default NRHUGE for node2." 60 echo "HUGEPGSZ Size of the hugepages to use in kB. If not set, kernel's default" 61 echo " setting is used." 62 echo "SHRINK_HUGE If set to 'yes', hugepages allocation won't be skipped in case" 63 echo " number of requested hugepages is lower from what's already" 64 echo " allocated." 65 echo "CLEAR_HUGE If set to 'yes', the attempt to remove hugepages from all nodes will" 66 echo " be made prior to allocation". 67 echo "PCI_ALLOWED" 68 echo "PCI_BLOCKED Whitespace separated list of PCI devices (NVMe, I/OAT, VMD, Virtio)." 69 echo " Each device must be specified as a full PCI address." 70 echo " E.g. PCI_ALLOWED=\"0000:01:00.0 0000:02:00.0\"" 71 echo " To block all PCI devices: PCI_ALLOWED=\"none\"" 72 echo " To allow all PCI devices except 0000:01:00.0: PCI_BLOCKED=\"0000:01:00.0\"" 73 echo " To allow only PCI device 0000:01:00.0: PCI_ALLOWED=\"0000:01:00.0\"" 74 echo " If PCI_ALLOWED and PCI_BLOCKED are empty or unset, all PCI devices" 75 echo " will be bound." 76 echo " Each device in PCI_BLOCKED will be ignored (driver won't be changed)." 77 echo " PCI_BLOCKED has precedence over PCI_ALLOWED." 78 echo "TARGET_USER User that will own hugepage mountpoint directory and vfio groups." 79 echo " By default the current user will be used." 80 echo "DRIVER_OVERRIDE Disable automatic vfio-pci/uio_pci_generic selection and forcefully" 81 echo " bind devices to the given driver." 82 echo " E.g. DRIVER_OVERRIDE=uio_pci_generic or DRIVER_OVERRIDE=/home/public/dpdk/build/kmod/igb_uio.ko" 83 echo "PCI_BLOCK_SYNC_ON_RESET" 84 echo " If set in the environment, the attempt to wait for block devices associated" 85 echo " with given PCI device will be made upon reset" 86 echo "UNBIND_ENTIRE_IOMMU_GROUP" 87 echo " If set, all devices from nvme's iommu group will be unbound from their drivers." 88 echo " Use with caution." 89 echo "DEV_TYPE" 90 echo " Perform action only against selected type of devices. Supported:" 91 echo " IOAT|DSA|IAA|VIRTIO|VMD|NVME." 92 echo " Default is to select all types." 93 echo "FORCE_NIC_UIO_REBIND" 94 echo " When set to 'yes', an attempt to reload nic_uio will be made regardless" 95 echo " of the kernel environment. Applicable only under FreeBSD." 96 exit 0 97} 98 99# In monolithic kernels the lsmod won't work. So 100# back that with a /sys/modules. We also check 101# /sys/bus/pci/drivers/ as neither lsmod nor /sys/modules might 102# contain needed info (like in Fedora-like OS). 103function check_for_driver() { 104 if [[ -z $1 ]]; then 105 return 0 106 fi 107 108 if lsmod | grep -q ${1//-/_}; then 109 return 1 110 fi 111 112 if [[ -d /sys/module/${1} || 113 -d /sys/module/${1//-/_} || 114 -d /sys/bus/pci/drivers/${1} || 115 -d /sys/bus/pci/drivers/${1//-/_} ]]; then 116 return 2 117 fi 118 return 0 119} 120 121function check_for_driver_freebsd() { 122 # Check if dpdk drivers (nic_uio, contigmem) are in the kernel's module path. 123 local search_paths path driver 124 IFS=";" read -ra search_paths < <(kldconfig -rU) 125 126 for driver in contigmem.ko nic_uio.ko; do 127 for path in "${search_paths[@]}"; do 128 [[ -f $path/$driver ]] && continue 2 129 done 130 return 1 131 done 132 return 0 133} 134 135function pci_dev_echo() { 136 local bdf="$1" 137 shift 138 echo "$bdf (${pci_ids_vendor["$bdf"]#0x} ${pci_ids_device["$bdf"]#0x}): $*" 139} 140 141function probe_driver() { 142 local bdf=$1 143 local driver_name=$2 144 old_driver_name=${pci_bus_driver["$bdf"]:-no driver} 145 146 if [[ $driver_name == "$old_driver_name" ]]; then 147 pci_dev_echo "$bdf" "Already using the $old_driver_name driver" 148 return 0 149 fi 150 151 if [[ $old_driver_name != "no driver" ]]; then 152 echo "$bdf" > "/sys/bus/pci/devices/$bdf/driver/unbind" 153 fi 154 155 pci_dev_echo "$bdf" "$old_driver_name -> $driver_name" 156 157 if [[ $driver_name == "none" ]]; then 158 return 0 159 fi 160 161 local probe_attempts=0 162 echo "$driver_name" > "/sys/bus/pci/devices/$bdf/driver_override" 163 while ! echo "$bdf" > "/sys/bus/pci/drivers_probe" && ((probe_attempts++ < 10)); do 164 pci_dev_echo "$bdf" "failed to bind to $driver_name, retrying ($probe_attempts)" 165 sleep 0.5 166 done 2> /dev/null 167 168 echo "" > "/sys/bus/pci/devices/$bdf/driver_override" 169 170 if [[ ! -e /sys/bus/pci/drivers/$driver_name/$bdf ]]; then 171 pci_dev_echo "$bdf" "failed to bind to $driver_name, aborting" 172 return 1 173 fi 174} 175 176function linux_bind_driver() { 177 local bdf="$1" 178 local driver_name="$2" 179 180 probe_driver "$bdf" "$driver_name" 181 182 local iommu_group=${pci_iommu_groups["$bdf"]} 183 if [ -e "/dev/vfio/$iommu_group" ]; then 184 if [ -n "$TARGET_USER" ]; then 185 chown "$TARGET_USER" "/dev/vfio/$iommu_group" 186 fi 187 fi 188 189 local iommug=("${!iommu_groups[iommu_group]}") 190 local _bdf _driver 191 if ((${#iommug[@]} > 1)) && [[ $driver_name == vfio* ]]; then 192 pci_dev_echo "$bdf" "WARNING: detected multiple devices (${#iommug[@]}) under the same IOMMU group!" 193 for _bdf in "${iommug[@]}"; do 194 [[ $_bdf == "$bdf" ]] && continue 195 _driver=$(readlink -f "/sys/bus/pci/devices/$_bdf/driver") && _driver=${_driver##*/} 196 if [[ $_driver == "$driver_name" ]]; then 197 continue 198 fi 199 # See what DPDK considers to be a "viable" iommu group: dpdk/lib/eal/linux/eal_vfio.c -> rte_vfio_setup_device() 200 pci_dev_echo "$bdf" "WARNING: ${_bdf##*/} not bound to $driver_name (${_driver:-no driver})" 201 pci_dev_echo "$bdf" "WARNING All devices in the IOMMU group must be bound to the same driver or unbound" 202 if [[ $UNBIND_ENTIRE_IOMMU_GROUP == yes ]]; then 203 pci_dev_echo "$bdf" "WARNING: Attempting to unbind ${_bdf##*/}" 204 pci_bus_driver["${_bdf##*/}"]=$_driver 205 probe_driver "${_bdf##*/}" none 206 fi 207 done 208 fi 209 210} 211 212function linux_unbind_driver() { 213 local bdf="$1" 214 local old_driver_name=${pci_bus_driver["$bdf"]:-no driver} 215 216 if [[ $old_driver_name == "no driver" ]]; then 217 pci_dev_echo "$bdf" "Not bound to any driver" 218 return 0 219 fi 220 221 if [[ -e /sys/bus/pci/drivers/$old_driver_name ]]; then 222 echo "$bdf" > "/sys/bus/pci/drivers/$old_driver_name/unbind" 223 echo "" > "/sys/bus/pci/devices/$bdf/driver_override" 224 fi 225 226 pci_dev_echo "$bdf" "$old_driver_name -> no driver" 227} 228 229function linux_hugetlbfs_mounts() { 230 mount | grep ' type hugetlbfs ' | awk '{ print $3 }' 231} 232 233function get_used_bdf_block_devs() { 234 local bdf=$1 235 local blocks block blockp dev mount holder 236 local used 237 238 hash lsblk &> /dev/null || return 1 239 blocks=($(get_block_dev_from_bdf "$bdf")) 240 241 for block in "${blocks[@]}"; do 242 # Check if the device is hold by some other, regardless if it's mounted 243 # or not. 244 for holder in "/sys/class/block/$block"*/holders/*; do 245 [[ -e $holder ]] || continue 246 blockp=${holder%/holders*} blockp=${blockp##*/} 247 if [[ -e $holder/slaves/$blockp ]]; then 248 used+=("holder@$blockp:${holder##*/}") 249 fi 250 done 251 while read -r dev mount; do 252 if [[ -e $mount ]]; then 253 used+=("mount@$block:$dev") 254 fi 255 done < <(lsblk -l -n -o NAME,MOUNTPOINT "/dev/$block") 256 if ((${#used[@]} == 0)); then 257 # Make sure we check if there's any valid data present on the target device 258 # regardless if it's being actively used or not. This is mainly done to make 259 # sure we don't miss more complex setups like ZFS pools, etc. 260 if block_in_use "$block" > /dev/null; then 261 used+=("data@$block") 262 fi 263 fi 264 done 265 266 if ((${#used[@]} > 0)); then 267 printf '%s\n' "${used[@]}" 268 fi 269} 270 271function collect_devices() { 272 local mode=$1 in_use 273 274 map_supported_devices "$DEV_TYPE" 275 276 for bdf in "${!all_devices_d[@]}"; do 277 in_use=0 278 if [[ $mode != status ]]; then 279 if ! pci_can_use "$bdf"; then 280 pci_dev_echo "$bdf" "Skipping denied controller at $bdf" 281 in_use=1 282 fi 283 fi 284 if [[ -n ${nvme_d["$bdf"]} || -n ${virtio_d["$bdf"]} ]]; then 285 if ! verify_bdf_block_devs "$bdf"; then 286 in_use=1 287 fi 288 fi 289 if [[ -n ${vmd_d["$bdf"]} ]]; then 290 if [[ $PCI_ALLOWED != *"$bdf"* ]]; then 291 pci_dev_echo "$bdf" "Skipping not allowed VMD controller at $bdf" 292 in_use=1 293 elif ((vmd_nvme_count["$bdf"] > 0)) && [[ $DRIVER_OVERRLDE != none && $mode == config ]]; then 294 cat <<- MESSAGE 295 Binding new driver to VMD device with NVMe SSDs attached to the kernel: 296 ${!vmd_nvme_d["$bdf"]} 297 The binding process may go faster if you first run this script with 298 DRIVER_OVERRIDE="none" to unbind only the NVMe SSDs, and then run 299 again to unbind the VMD devices. 300 MESSAGE 301 fi 302 fi 303 if [[ -n ${dsa_d["$bdf"]} ]] && [[ $PCI_ALLOWED != *"$bdf"* ]]; then 304 pci_dev_echo "$bdf" "Skipping not allowed DSA controller at $bdf" 305 in_use=1 306 fi 307 if [[ -n ${iaa_d["$bdf"]} ]] && [[ $PCI_ALLOWED != *"$bdf"* ]]; then 308 pci_dev_echo "$bdf" "Skipping not allowed IAA controller at $bdf" 309 in_use=1 310 fi 311 # Update in-use for each bdf. Default from the map_supported_devices() is 0 == "not used" 312 local -n type_ref=${all_devices_type_d["$bdf"]}_d 313 type_ref["$bdf"]=$in_use 314 all_devices_d["$bdf"]=$in_use 315 done 316 317 # Check if we got any nvmes attached to VMDs sharing the same iommu_group - if there are 318 # any skip them since they won't be usable by SPDK without moving the entire VMD ctrl 319 # away from the kernel first. That said, allow to touch the nvmes in case user requested 320 # all devices to be unbound from any driver or if dedicated override flag was set. 321 [[ -z $ALLOW_NVME_BEHIND_VMD && $DRIVER_OVERRIDE != none ]] || return 0 322 323 for bdf in "${!nvme_d[@]}"; do 324 is_nvme_iommu_shared_with_vmd "$bdf" || continue 325 nvme_d["$bdf"]=1 all_devices_d["$bdf"]=1 326 pci_dev_echo "$bdf" "Skipping nvme behind VMD (${nvme_vmd_d["$bdf"]})" 327 done 328 329 get_unsupported_nic_uio_hw 330 331 return 0 332} 333 334function collect_driver() { 335 local bdf=$1 336 local drivers driver 337 338 if [[ -e /sys/bus/pci/devices/$bdf/modalias ]] \ 339 && drivers=($(modprobe -R "$(< "/sys/bus/pci/devices/$bdf/modalias")")); then 340 # Pick first entry in case multiple aliases are bound to a driver. 341 driver=$(readlink -f "/sys/module/${drivers[0]}/drivers/pci:"*) 342 driver=${driver##*/} 343 else 344 [[ -n ${nvme_d["$bdf"]} ]] && driver=nvme 345 [[ -n ${ioat_d["$bdf"]} ]] && driver=ioatdma 346 [[ -n ${dsa_d["$bdf"]} ]] && driver=idxd 347 [[ -n ${iaa_d["$bdf"]} ]] && driver=idxd 348 [[ -n ${virtio_d["$bdf"]} ]] && driver=virtio-pci 349 [[ -n ${vmd_d["$bdf"]} ]] && driver=vmd 350 fi 2> /dev/null 351 echo "$driver" 352} 353 354function verify_bdf_block_devs() { 355 local bdf=$1 356 local blknames 357 blknames=($(get_used_bdf_block_devs "$bdf")) || return 1 358 359 if ((${#blknames[@]} > 0)); then 360 local IFS="," 361 pci_dev_echo "$bdf" "Active devices: ${blknames[*]}, so not binding PCI dev" 362 return 1 363 fi 364} 365 366function configure_linux_pci() { 367 local driver_path="" 368 driver_name="" 369 igb_uio_fallback="" 370 371 if [[ -r "$rootdir/dpdk/build-tmp/kernel/linux/igb_uio/igb_uio.ko" ]]; then 372 # igb_uio is a common driver to override with and it depends on uio. 373 modprobe uio || true 374 if ! check_for_driver igb_uio || insmod "$rootdir/dpdk/build-tmp/kernel/linux/igb_uio/igb_uio.ko"; then 375 igb_uio_fallback="$rootdir/dpdk/build-tmp/kernel/linux/igb_uio/igb_uio.ko" 376 fi 377 fi 378 379 if [[ "${DRIVER_OVERRIDE}" == "none" ]]; then 380 driver_name=none 381 elif [[ -n "${DRIVER_OVERRIDE}" ]]; then 382 driver_path="$DRIVER_OVERRIDE" 383 driver_name="${DRIVER_OVERRIDE##*/}" 384 # modprobe and the sysfs don't use the .ko suffix. 385 driver_name=${driver_name%.ko} 386 # path = name -> there is no path 387 if [[ "$driver_path" = "$driver_name" ]]; then 388 driver_path="" 389 fi 390 elif is_iommu_enabled; then 391 driver_name=vfio-pci 392 # Just in case, attempt to load VFIO_IOMMU_TYPE1 module into the kernel - this 393 # should be done automatically by modprobe since this particular module should 394 # be a part of vfio-pci dependencies, however, on some distros, it seems that 395 # it's not the case. See #1689. 396 if modinfo vfio_iommu_type1 > /dev/null; then 397 modprobe vfio_iommu_type1 398 fi 399 elif ! check_for_driver uio_pci_generic || modinfo uio_pci_generic > /dev/null 2>&1; then 400 driver_name=uio_pci_generic 401 elif [[ -e $igb_uio_fallback ]]; then 402 driver_path="$igb_uio_fallback" 403 driver_name="igb_uio" 404 echo "WARNING: uio_pci_generic not detected - using $driver_name" 405 else 406 echo "No valid drivers found [vfio-pci, uio_pci_generic, igb_uio]. Please enable one of the kernel modules." 407 return 1 408 fi 409 410 # modprobe assumes the directory of the module. If the user passes in a path, we should use insmod 411 if [[ $driver_name != "none" ]]; then 412 if [[ -n "$driver_path" ]]; then 413 insmod $driver_path || true 414 else 415 modprobe $driver_name 416 fi 417 fi 418 419 for bdf in "${!all_devices_d[@]}"; do 420 if ((all_devices_d["$bdf"] == 0)); then 421 if [[ -n ${nvme_d["$bdf"]} ]]; then 422 # Some nvme controllers may take significant amount of time while being 423 # unbound from the driver. Put that task into background to speed up the 424 # whole process. Currently this is done only for the devices bound to the 425 # nvme driver as other, i.e., ioatdma's, trigger a kernel BUG when being 426 # unbound in parallel. See https://bugzilla.kernel.org/show_bug.cgi?id=209041. 427 linux_bind_driver "$bdf" "$driver_name" & 428 else 429 linux_bind_driver "$bdf" "$driver_name" 430 fi 431 fi 432 done 433 wait 434 435 echo "1" > "/sys/bus/pci/rescan" 436} 437 438function cleanup_linux() { 439 local dirs_to_clean=() files_to_clean=() opened_files=() file_locks=() 440 local match_spdk="spdk_tgt|iscsi|vhost|nvmf|rocksdb|bdevio|bdevperf|vhost_fuzz|nvme_fuzz|accel_perf|bdev_svc" 441 442 dirs_to_clean=({/var/run,/tmp}/dpdk/spdk{,_pid}+([0-9])) 443 if [[ -d $XDG_RUNTIME_DIR ]]; then 444 dirs_to_clean+=("$XDG_RUNTIME_DIR/dpdk/spdk"{,_pid}+([0-9])) 445 fi 446 447 for dir in "${dirs_to_clean[@]}"; do 448 files_to_clean+=("$dir/"*) 449 done 450 file_locks+=(/var/tmp/spdk_pci_lock*) 451 file_locks+=(/var/tmp/spdk_cpu_lock*) 452 453 files_to_clean+=(/dev/shm/@(@($match_spdk)_trace|spdk_iscsi_conns)*) 454 files_to_clean+=("${file_locks[@]}") 455 456 # This may fail in case path that readlink attempts to resolve suddenly 457 # disappears (as it may happen with terminating processes). 458 opened_files+=($(readlink -f /proc/+([0-9])/fd/+([0-9]))) || true 459 460 if ((${#opened_files[@]} == 0)); then 461 echo "Can't get list of opened files!" 462 exit 1 463 fi 464 465 echo 'Cleaning' 466 for f in "${files_to_clean[@]}"; do 467 [[ -e $f ]] || continue 468 if [[ ${opened_files[*]} != *"$f"* ]]; then 469 echo "Removing: $f" 470 rm $f 471 else 472 echo "Still open: $f" 473 fi 474 done 475 476 for dir in "${dirs_to_clean[@]}"; do 477 [[ -d $dir ]] || continue 478 if [[ ${opened_files[*]} != *"$dir"* ]]; then 479 echo "Removing: $dir" 480 rmdir $dir 481 else 482 echo "Still open: $dir" 483 fi 484 done 485 echo "Clean" 486} 487 488check_hugepages_alloc() { 489 local hp_int=$1 490 local allocated_hugepages 491 492 allocated_hugepages=$(< "$hp_int") 493 494 if ((NRHUGE <= allocated_hugepages)) && [[ $SHRINK_HUGE != yes ]]; then 495 echo "INFO: Requested $NRHUGE hugepages but $allocated_hugepages already allocated ${2:+on node$2}" 496 return 0 497 fi 498 499 echo $((NRHUGE < 0 ? 0 : NRHUGE)) > "$hp_int" 500 501 allocated_hugepages=$(< "$hp_int") 502 if ((allocated_hugepages < NRHUGE)); then 503 cat <<- ERROR 504 505 ## ERROR: requested $NRHUGE hugepages but $allocated_hugepages could be allocated ${2:+on node$2}. 506 ## Memory might be heavily fragmented. Please try flushing the system cache, or reboot the machine. 507 ERROR 508 return 1 509 fi 510} 511 512clear_hugepages() { echo 0 > /proc/sys/vm/nr_hugepages; } 513 514configure_linux_hugepages() { 515 local node system_nodes 516 local nodes_to_use nodes_hp 517 518 if [[ $CLEAR_HUGE == yes ]]; then 519 clear_hugepages 520 fi 521 522 if [[ -z $HUGENODE ]]; then 523 check_hugepages_alloc /proc/sys/vm/nr_hugepages 524 return 0 525 fi 526 527 for node in /sys/devices/system/node/node*; do 528 [[ -e $node ]] || continue 529 nodes[${node##*node}]=$node/hugepages/hugepages-${HUGEPGSZ}kB/nr_hugepages 530 done 531 532 if ((${#nodes[@]} == 0)); then 533 # No NUMA support? Fallback to common interface 534 check_hugepages_alloc /proc/sys/vm/nr_hugepages 535 return 0 536 fi 537 538 IFS="," read -ra nodes_to_use <<< "$HUGENODE" 539 if ((${#nodes_to_use[@]} == 0)); then 540 nodes_to_use[0]=0 541 fi 542 543 # Align indexes with node ids 544 for node in "${!nodes_to_use[@]}"; do 545 if [[ ${nodes_to_use[node]} =~ ^nodes_hp\[[0-9]+\]= ]]; then 546 eval "${nodes_to_use[node]}" 547 elif [[ ${nodes_to_use[node]} =~ ^[0-9]+$ ]]; then 548 nodes_hp[nodes_to_use[node]]=$NRHUGE 549 fi 550 done 551 552 for node in "${!nodes_hp[@]}"; do 553 if [[ -z ${nodes[node]} ]]; then 554 echo "Node $node doesn't exist, ignoring" >&2 555 continue 556 fi 557 NRHUGE=${nodes_hp[node]:-$NRHUGE} check_hugepages_alloc "${nodes[node]}" "$node" 558 done 559} 560 561function configure_linux() { 562 configure_linux_pci 563 hugetlbfs_mounts=$(linux_hugetlbfs_mounts) 564 565 if [ -z "$hugetlbfs_mounts" ]; then 566 hugetlbfs_mounts=/mnt/huge 567 echo "Mounting hugetlbfs at $hugetlbfs_mounts" 568 mkdir -p "$hugetlbfs_mounts" 569 mount -t hugetlbfs nodev "$hugetlbfs_mounts" 570 fi 571 572 configure_linux_hugepages 573 574 if [ "$driver_name" = "vfio-pci" ]; then 575 if [ -n "$TARGET_USER" ]; then 576 for mount in $hugetlbfs_mounts; do 577 chown "$TARGET_USER" "$mount" 578 chmod g+w "$mount" 579 done 580 581 MEMLOCK_AMNT=$(su "$TARGET_USER" -c "ulimit -l") 582 if [[ $MEMLOCK_AMNT != "unlimited" ]]; then 583 MEMLOCK_MB=$((MEMLOCK_AMNT / 1024)) 584 cat <<- MEMLOCK 585 "$TARGET_USER" user memlock limit: $MEMLOCK_MB MB 586 587 This is the maximum amount of memory you will be 588 able to use with DPDK and VFIO if run as user "$TARGET_USER". 589 To change this, please adjust limits.conf memlock limit for user "$TARGET_USER". 590 MEMLOCK 591 if ((MEMLOCK_AMNT < 65536)); then 592 echo "" 593 echo "## WARNING: memlock limit is less than 64MB" 594 echo -n "## DPDK with VFIO may not be able to initialize " 595 echo "if run as user \"$TARGET_USER\"." 596 fi 597 fi 598 fi 599 fi 600 601 if [ $(uname -i) == "x86_64" ] && [ ! -e /dev/cpu/0/msr ]; then 602 # Some distros build msr as a module. Make sure it's loaded to ensure 603 # DPDK can easily figure out the TSC rate rather than relying on 100ms 604 # sleeps. 605 modprobe msr &> /dev/null || true 606 fi 607} 608 609function reset_linux_pci() { 610 # virtio 611 # TODO: check if virtio-pci is loaded first and just unbind if it is not loaded 612 # Requires some more investigation - for example, some kernels do not seem to have 613 # virtio-pci but just virtio_scsi instead. Also need to make sure we get the 614 # underscore vs. dash right in the virtio_scsi name. 615 modprobe virtio-pci || true 616 for bdf in "${!all_devices_d[@]}"; do 617 ((all_devices_d["$bdf"] == 0)) || continue 618 619 driver=$(collect_driver "$bdf") 620 if [[ -n $driver ]] && ! check_for_driver "$driver"; then 621 linux_bind_driver "$bdf" "$driver" 622 else 623 linux_unbind_driver "$bdf" 624 fi 625 done 626 627 echo "1" > "/sys/bus/pci/rescan" 628} 629 630function reset_linux() { 631 reset_linux_pci 632 for mount in $(linux_hugetlbfs_mounts); do 633 for hp in "$mount"/spdk*map_*; do 634 flock -n "$hp" true && rm -f "$hp" 635 done 636 done 637 rm -f /run/.spdk* 638} 639 640function status_linux() { 641 echo "Hugepages" >&2 642 printf "%-6s %10s %8s / %6s\n" "node" "hugesize" "free" "total" >&2 643 644 numa_nodes=0 645 for path in /sys/devices/system/node/node*/hugepages/hugepages-*/; do 646 numa_nodes=$((numa_nodes + 1)) 647 free_pages=$(cat $path/free_hugepages) 648 all_pages=$(cat $path/nr_hugepages) 649 650 [[ $path =~ (node[0-9]+)/hugepages/hugepages-([0-9]+kB) ]] 651 652 node=${BASH_REMATCH[1]} 653 huge_size=${BASH_REMATCH[2]} 654 655 printf "%-6s %10s %8s / %6s\n" $node $huge_size $free_pages $all_pages 656 done 657 658 # fall back to system-wide hugepages 659 if [ "$numa_nodes" = "0" ]; then 660 free_pages=$(grep HugePages_Free /proc/meminfo | awk '{ print $2 }') 661 all_pages=$(grep HugePages_Total /proc/meminfo | awk '{ print $2 }') 662 node="-" 663 huge_size="$HUGEPGSZ" 664 665 printf "%-6s %10s %8s / %6s\n" $node $huge_size $free_pages $all_pages 666 fi 667 668 printf '\n%-25s %-15s %-6s %-6s %-7s %-16s %-10s %s\n' \ 669 "Type" "BDF" "Vendor" "Device" "NUMA" "Driver" "Device" "Block devices" >&2 670 671 sorted_bdfs=($(printf '%s\n' "${!all_devices_d[@]}" | sort)) 672 673 for bdf in "${sorted_bdfs[@]}"; do 674 driver=${pci_bus_driver["$bdf"]} 675 if [ "$numa_nodes" = "0" ]; then 676 node="-" 677 else 678 node=$(cat /sys/bus/pci/devices/$bdf/numa_node) 679 if ((node == -1)); then 680 node=unknown 681 fi 682 fi 683 if [ "$driver" = "nvme" ] && [ -d /sys/bus/pci/devices/$bdf/nvme ]; then 684 name=$(ls /sys/bus/pci/devices/$bdf/nvme) 685 else 686 name="-" 687 fi 688 689 if [[ -n ${nvme_d["$bdf"]} || -n ${virtio_d["$bdf"]} ]]; then 690 blknames=($(get_block_dev_from_bdf "$bdf")) 691 else 692 blknames=("-") 693 fi 694 695 desc="" 696 desc=${desc:-${nvme_d["$bdf"]:+NVMe${nvme_vmd_d["$bdf"]:+@${nvme_vmd_d["$bdf"]}(VMD)}}} 697 desc=${desc:-${ioat_d["$bdf"]:+I/OAT}} 698 desc=${desc:-${dsa_d["$bdf"]:+DSA}} 699 desc=${desc:-${iaa_d["$bdf"]:+IAA}} 700 desc=${desc:-${virtio_d["$bdf"]:+virtio}} 701 desc=${desc:-${vmd_d["$bdf"]:+VMD}} 702 703 printf '%-25s %-15s %-6s %-6s %-7s %-16s %-10s %s\n' \ 704 "$desc" "$bdf" "${pci_ids_vendor["$bdf"]#0x}" "${pci_ids_device["$bdf"]#0x}" \ 705 "$node" "${driver:--}" "${name:-}" "${blknames[*]:--}" 706 done 707} 708 709function status_freebsd() { 710 local pci 711 712 status_print() ( 713 local type=$1 714 local dev driver 715 716 shift 717 718 for pci; do 719 printf '%-8s %-15s %-6s %-6s %-16s\n' \ 720 "$type" \ 721 "$pci" \ 722 "${pci_ids_vendor["$pci"]}" \ 723 "${pci_ids_device["$pci"]}" \ 724 "${pci_bus_driver["$pci"]}" 725 done | sort -k2,2 726 ) 727 728 local contigmem=present 729 local contigmem_buffer_size 730 local contigmem_num_buffers 731 732 if ! kldstat -q -m contigmem; then 733 contigmem="not present" 734 fi 735 if ! contigmem_buffer_size=$(kenv hw.contigmem.buffer_size 2> /dev/null); then 736 contigmem_buffer_size="not set" 737 fi 738 if ! contigmem_num_buffers=$(kenv hw.contigmem.num_buffers 2> /dev/null); then 739 contigmem_num_buffers="not set" 740 fi 741 742 cat <<- BSD_INFO 743 Contigmem ($contigmem) 744 Buffer Size: $contigmem_buffer_size 745 Num Buffers: $contigmem_num_buffers 746 747 BSD_INFO 748 749 printf '\n%-8s %-15s %-6s %-6s %-16s\n' \ 750 "Type" "BDF" "Vendor" "Device" "Driver" >&2 751 752 status_print "NVMe" "${!nvme_d[@]}" 753 status_print "I/OAT" "${!ioat_d[@]}" 754 status_print "DSA" "${!dsa_d[@]}" 755 status_print "IAA" "${!iaa_d[@]}" 756 status_print "VMD" "${!vmd_d[@]}" 757} 758 759function configure_freebsd_pci() { 760 local BDFS 761 762 BDFS+=("$@") 763 764 if ((${#unsupported_nic_uio_hw[@]} > 0)) && [[ $FORCE_NIC_UIO_REBIND != yes ]]; then 765 warn_unsupported_nic_uio_hw 766 return 1 767 fi 768 769 BDFS+=("${unsupported_nic_uio_hw[@]}") 770 771 if kldstat -n nic_uio &> /dev/null; then 772 kldunload nic_uio.ko 773 fi 774 775 local IFS="," 776 kenv hw.nic_uio.bdfs="${BDFS[*]}" 777 kldload nic_uio.ko 778} 779 780function get_unsupported_nic_uio_hw() { 781 local bdfs bdf all_devices 782 local -g unsupported_nic_uio_hw 783 784 IFS="," read -ra bdfs < <(kenv hw.nic_uio.bdfs 2> /dev/null) || return 0 785 786 for bdf in "${bdfs[@]}"; do 787 grep -q "$bdf" <(printf '%s\n' "${!all_devices_d[@]}") || unsupported_nic_uio_hw+=("$bdf") 788 done 789 790 return 0 791} 792 793function warn_unsupported_nic_uio_hw() { 794 cat <<- NIC_UIO 795 796 WARNING: Unsupported devices detected in the nic_uio setup: 797 798 $(printf ' %s\n' "${unsupported_nic_uio_hw[@]}") 799 800 Remove them first or pass FORCE_NIC_UIO_REBIND=yes through the environment. 801 802 NIC_UIO 803} 804 805function configure_freebsd() { 806 _configure_freebsd "${!nvme_d[@]}" "${!ioat_d[@]}" "${!dsa_d[@]}" "${!iaa_d[@]}" "${!vmd_d[@]}" 807} 808 809function _configure_freebsd() { 810 if ! check_for_driver_freebsd; then 811 echo "DPDK drivers (contigmem and/or nic_uio) are missing, aborting" >&2 812 return 1 813 fi 814 configure_freebsd_pci "$@" 815 # If contigmem is already loaded but the HUGEMEM specified doesn't match the 816 # previous value, unload contigmem so that we can reload with the new value. 817 if kldstat -q -m contigmem; then 818 # contigmem may be loaded, but the kernel environment doesn't have to 819 # be necessarily set at this point. If it isn't, kenv will fail to 820 # pick up the hw. options. Handle it. 821 if ! contigmem_num_buffers=$(kenv hw.contigmem.num_buffers); then 822 contigmem_num_buffers=-1 823 fi 2> /dev/null 824 if ((contigmem_num_buffers != HUGEMEM / 256)); then 825 kldunload contigmem.ko 826 fi 827 fi 828 if ! kldstat -q -m contigmem; then 829 kenv hw.contigmem.num_buffers=$((HUGEMEM / 256)) 830 kenv hw.contigmem.buffer_size=$((256 * 1024 * 1024)) 831 kldload contigmem.ko 832 fi 833} 834 835function reset_freebsd() { 836 # Don't reap the entire nic_uio setup in case there are unsupported devices in the kernel env 837 if ((${#unsupported_nic_uio_hw[@]} > 0)) && [[ $FORCE_NIC_UIO_REBIND != yes ]]; then 838 warn_unsupported_nic_uio_hw 839 return 1 840 fi 841 842 kldunload contigmem.ko || true 843 kldunload nic_uio.ko || true 844 845 if ((${#unsupported_nic_uio_hw[@]} > 0)); then 846 # HACK: try to be nice and recreate the setup but only with the unsupported devices 847 _unsupported_nic_uio_hw=("${unsupported_nic_uio_hw[@]}") unsupported_nic_uio_hw=() 848 _configure_freebsd "${_unsupported_nic_uio_hw[@]}" 849 fi 850} 851 852function set_hp() { 853 if [[ -n $HUGEPGSZ && ! -e /sys/kernel/mm/hugepages/hugepages-${HUGEPGSZ}kB ]]; then 854 echo "${HUGEPGSZ}kB is not supported by the running kernel, ignoring" >&2 855 unset -v HUGEPGSZ 856 fi 857 858 HUGEPGSZ=${HUGEPGSZ:-$(grep Hugepagesize /proc/meminfo | cut -d : -f 2 | tr -dc '0-9')} 859 HUGEPGSZ_MB=$((HUGEPGSZ / 1024)) 860 NRHUGE=${NRHUGE:-$(((HUGEMEM + HUGEPGSZ_MB - 1) / HUGEPGSZ_MB))} 861} 862 863kmsg "spdk: $0 $* (start)" 864 865CMD=reset cache_pci_bus 866 867mode=$1 868 869if [ -z "$mode" ]; then 870 mode="config" 871fi 872 873: ${HUGEMEM:=2048} 874: ${PCI_ALLOWED:=""} 875: ${PCI_BLOCKED:=""} 876 877if [ -n "$NVME_ALLOWED" ]; then 878 PCI_ALLOWED="$PCI_ALLOWED $NVME_ALLOWED" 879fi 880 881if [ -n "$SKIP_PCI" ]; then 882 PCI_ALLOWED="none" 883fi 884 885if [ -z "$TARGET_USER" ]; then 886 TARGET_USER="$SUDO_USER" 887 if [ -z "$TARGET_USER" ]; then 888 TARGET_USER=$(logname 2> /dev/null) || true 889 fi 890fi 891 892collect_devices "$mode" 893 894if [[ $os == Linux ]]; then 895 set_hp 896fi 897 898if [[ $mode == interactive ]]; then 899 source "$rootdir/scripts/common/setup/interactive.sh" 900 main_menu "$2" || exit 0 901fi 902 903if [[ $mode == reset && $PCI_BLOCK_SYNC_ON_RESET == yes ]]; then 904 # Note that this will wait only for the first block device attached to 905 # a given storage controller. For nvme this may miss some of the devs 906 # in case multiple namespaces are being in place. 907 # FIXME: Wait for nvme controller(s) to be in live state and determine 908 # number of configured namespaces, build list of potential block devs 909 # and pass them to sync_dev_uevents. Is it worth the effort? 910 bdfs_to_wait_for=() 911 for bdf in "${!all_devices_d[@]}"; do 912 ((all_devices_d["$bdf"] == 0)) || continue 913 if [[ -n ${nvme_d["$bdf"]} || -n ${virtio_d["$bdf"]} ]]; then 914 [[ $(collect_driver "$bdf") != "${pci_bus_driver["$bdf"]}" ]] || continue 915 bdfs_to_wait_for+=("$bdf") 916 fi 917 done 918 if ((${#bdfs_to_wait_for[@]} > 0)); then 919 echo "Waiting for block devices as requested" 920 export UEVENT_TIMEOUT=5 DEVPATH_LOOKUP=yes DEVPATH_SUBSYSTEM=pci 921 "$rootdir/scripts/sync_dev_uevents.sh" \ 922 block/disk \ 923 "${bdfs_to_wait_for[@]}" & 924 sync_pid=$! 925 fi 926fi 927 928if [[ $os == Linux ]]; then 929 if [ "$mode" == "config" ]; then 930 configure_linux 931 elif [ "$mode" == "cleanup" ]; then 932 cleanup_linux 933 clear_hugepages 934 elif [ "$mode" == "reset" ]; then 935 reset_linux 936 elif [ "$mode" == "status" ]; then 937 status_linux 938 elif [ "$mode" == "help" ]; then 939 usage $0 940 else 941 usage $0 "Invalid argument '$mode'" 942 fi 943else 944 if [ "$mode" == "config" ]; then 945 configure_freebsd 946 elif [ "$mode" == "reset" ]; then 947 reset_freebsd 948 elif [ "$mode" == "cleanup" ]; then 949 echo "setup.sh cleanup function not yet supported on $os" 950 elif [ "$mode" == "status" ]; then 951 status_freebsd 952 elif [ "$mode" == "help" ]; then 953 usage $0 954 else 955 usage $0 "Invalid argument '$mode'" 956 fi 957fi 958 959if [[ -e /proc/$sync_pid/status ]]; then 960 wait "$sync_pid" 961fi 962 963kmsg "spdk: $0 $* (done)" 964