1#!/usr/bin/env bash 2# SPDX-License-Identifier: BSD-3-Clause 3# Copyright (C) 2016 Intel Corporation 4# All rights reserved. 5# 6set -e 7shopt -s nullglob extglob 8 9os=$(uname -s) 10 11if [[ $os != Linux && $os != FreeBSD ]]; then 12 echo "Not supported platform ($os), aborting" 13 exit 1 14fi 15 16rootdir=$(readlink -f $(dirname $0))/.. 17source "$rootdir/scripts/common.sh" 18 19function usage() { 20 if [[ $os == Linux ]]; then 21 options="[config|reset|status|cleanup|interactive|help]" 22 else 23 options="[config|reset|interactive|help]" 24 fi 25 26 [[ -n $2 ]] && ( 27 echo "$2" 28 echo "" 29 ) 30 echo "Helper script for allocating hugepages and binding NVMe, I/OAT, VMD and Virtio devices" 31 echo "to a generic VFIO kernel driver. If VFIO is not available on the system, this script" 32 echo "will fall back to UIO. NVMe and Virtio devices with active mountpoints will be ignored." 33 echo "All hugepage operations use default hugepage size on the system (hugepagesz)." 34 echo "Usage: $(basename $1) $options" 35 echo 36 echo "$options - as following:" 37 echo "config Default mode. Allocate hugepages and bind PCI devices." 38 if [[ $os == Linux ]]; then 39 echo "cleanup Remove any orphaned files that can be left in the system after SPDK application exit" 40 fi 41 echo "reset Rebind PCI devices back to their original drivers." 42 echo " Also cleanup any leftover spdk files/resources." 43 echo " Hugepage memory size will remain unchanged." 44 if [[ $os == Linux ]]; then 45 echo "status Print status of all SPDK-compatible devices on the system." 46 fi 47 echo "interactive Executes script in interactive mode." 48 echo "help Print this help message." 49 echo 50 echo "The following environment variables can be specified." 51 echo "HUGEMEM Size of hugepage memory to allocate (in MB). 2048 by default." 52 echo " For NUMA systems, the hugepages will be distributed on node0 by" 53 echo " default." 54 echo "HUGE_EVEN_ALLOC If set to 'yes', hugepages will be evenly distributed across all" 55 echo " system's NUMA nodes (effectively ignoring anything set in HUGENODE)." 56 echo " Uses kernel's default for hugepages size." 57 echo "NRHUGE Number of hugepages to allocate. This variable overwrites HUGEMEM." 58 echo "HUGENODE Specific NUMA node to allocate hugepages on. Multiple nodes can be" 59 echo " separated with comas. By default, NRHUGE will be applied on each node." 60 echo " Hugepages can be defined per node with e.g.:" 61 echo " HUGENODE='nodes_hp[0]=2048,nodes_hp[1]=512,2' - this will allocate" 62 echo " 2048 pages for node0, 512 for node1 and default NRHUGE for node2." 63 echo "HUGEPGSZ Size of the hugepages to use in kB. If not set, kernel's default" 64 echo " setting is used." 65 echo "SHRINK_HUGE If set to 'yes', hugepages allocation won't be skipped in case" 66 echo " number of requested hugepages is lower from what's already" 67 echo " allocated. Doesn't apply when HUGE_EVEN_ALLOC is in use." 68 echo "CLEAR_HUGE If set to 'yes', the attempt to remove hugepages from all nodes will" 69 echo " be made prior to allocation". 70 echo "PCI_ALLOWED" 71 echo "PCI_BLOCKED Whitespace separated list of PCI devices (NVMe, I/OAT, VMD, Virtio)." 72 echo " Each device must be specified as a full PCI address." 73 echo " E.g. PCI_ALLOWED=\"0000:01:00.0 0000:02:00.0\"" 74 echo " To block all PCI devices: PCI_ALLOWED=\"none\"" 75 echo " To allow all PCI devices except 0000:01:00.0: PCI_BLOCKED=\"0000:01:00.0\"" 76 echo " To allow only PCI device 0000:01:00.0: PCI_ALLOWED=\"0000:01:00.0\"" 77 echo " If PCI_ALLOWED and PCI_BLOCKED are empty or unset, all PCI devices" 78 echo " will be bound." 79 echo " Each device in PCI_BLOCKED will be ignored (driver won't be changed)." 80 echo " PCI_BLOCKED has precedence over PCI_ALLOWED." 81 echo "TARGET_USER User that will own hugepage mountpoint directory and vfio groups." 82 echo " By default the current user will be used." 83 echo "DRIVER_OVERRIDE Disable automatic vfio-pci/uio_pci_generic selection and forcefully" 84 echo " bind devices to the given driver." 85 echo " E.g. DRIVER_OVERRIDE=uio_pci_generic or DRIVER_OVERRIDE=/home/public/dpdk/build/kmod/igb_uio.ko" 86 echo "PCI_BLOCK_SYNC_ON_RESET" 87 echo " If set in the environment, the attempt to wait for block devices associated" 88 echo " with given PCI device will be made upon reset" 89 echo "UNBIND_ENTIRE_IOMMU_GROUP" 90 echo " If set, all devices from nvme's iommu group will be unbound from their drivers." 91 echo " Use with caution." 92 echo "DEV_TYPE" 93 echo " Perform action only against selected type of devices. Supported:" 94 echo " IOAT|DSA|IAA|VIRTIO|VMD|NVME." 95 echo " Default is to select all types." 96 exit 0 97} 98 99# In monolithic kernels the lsmod won't work. So 100# back that with a /sys/modules. We also check 101# /sys/bus/pci/drivers/ as neither lsmod nor /sys/modules might 102# contain needed info (like in Fedora-like OS). 103function check_for_driver() { 104 if [[ -z $1 ]]; then 105 return 0 106 fi 107 108 if lsmod | grep -q ${1//-/_}; then 109 return 1 110 fi 111 112 if [[ -d /sys/module/${1} || 113 -d /sys/module/${1//-/_} || 114 -d /sys/bus/pci/drivers/${1} || 115 -d /sys/bus/pci/drivers/${1//-/_} ]]; then 116 return 2 117 fi 118 return 0 119} 120 121function check_for_driver_freebsd() { 122 # Check if dpdk drivers (nic_uio, contigmem) are in the kernel's module path. 123 local search_paths path driver 124 IFS=";" read -ra search_paths < <(kldconfig -rU) 125 126 for driver in contigmem.ko nic_uio.ko; do 127 for path in "${search_paths[@]}"; do 128 [[ -f $path/$driver ]] && continue 2 129 done 130 return 1 131 done 132 return 0 133} 134 135function pci_dev_echo() { 136 local bdf="$1" 137 shift 138 echo "$bdf (${pci_ids_vendor["$bdf"]#0x} ${pci_ids_device["$bdf"]#0x}): $*" 139} 140 141function probe_driver() { 142 local bdf=$1 143 local driver_name=$2 144 old_driver_name=${pci_bus_driver["$bdf"]:-no driver} 145 146 if [[ $driver_name == "$old_driver_name" ]]; then 147 pci_dev_echo "$bdf" "Already using the $old_driver_name driver" 148 return 0 149 fi 150 151 if [[ $old_driver_name != "no driver" ]]; then 152 echo "$bdf" > "/sys/bus/pci/devices/$bdf/driver/unbind" 153 fi 154 155 pci_dev_echo "$bdf" "$old_driver_name -> $driver_name" 156 157 if [[ $driver_name == "none" ]]; then 158 return 0 159 fi 160 161 local probe_attempts=0 162 echo "$driver_name" > "/sys/bus/pci/devices/$bdf/driver_override" 163 while ! echo "$bdf" > "/sys/bus/pci/drivers_probe" && ((probe_attempts++ < 10)); do 164 pci_dev_echo "$bdf" "failed to bind to $driver_name, retrying ($probe_attempts)" 165 sleep 0.5 166 done 2> /dev/null 167 168 echo "" > "/sys/bus/pci/devices/$bdf/driver_override" 169 170 if [[ ! -e /sys/bus/pci/drivers/$driver_name/$bdf ]]; then 171 pci_dev_echo "$bdf" "failed to bind to $driver_name, aborting" 172 return 1 173 fi 174} 175 176function linux_bind_driver() { 177 local bdf="$1" 178 local driver_name="$2" 179 180 probe_driver "$bdf" "$driver_name" 181 182 local iommu_group=${pci_iommu_groups["$bdf"]} 183 if [ -e "/dev/vfio/$iommu_group" ]; then 184 if [ -n "$TARGET_USER" ]; then 185 chown "$TARGET_USER" "/dev/vfio/$iommu_group" 186 fi 187 fi 188 189 local iommug=("${!iommu_groups[iommu_group]}") 190 local _bdf _driver 191 if ((${#iommug[@]} > 1)) && [[ $driver_name == vfio* ]]; then 192 pci_dev_echo "$bdf" "WARNING: detected multiple devices (${#iommug[@]}) under the same IOMMU group!" 193 for _bdf in "${iommug[@]}"; do 194 [[ $_bdf == "$bdf" ]] && continue 195 _driver=$(readlink -f "/sys/bus/pci/devices/$_bdf/driver") && _driver=${_driver##*/} 196 if [[ $_driver == "$driver_name" ]]; then 197 continue 198 fi 199 # See what DPDK considers to be a "viable" iommu group: dpdk/lib/eal/linux/eal_vfio.c -> rte_vfio_setup_device() 200 pci_dev_echo "$bdf" "WARNING: ${_bdf##*/} not bound to $driver_name (${_driver:-no driver})" 201 pci_dev_echo "$bdf" "WARNING All devices in the IOMMU group must be bound to the same driver or unbound" 202 if [[ $UNBIND_ENTIRE_IOMMU_GROUP == yes ]]; then 203 pci_dev_echo "$bdf" "WARNING: Attempting to unbind ${_bdf##*/}" 204 pci_bus_driver["${_bdf##*/}"]=$_driver 205 probe_driver "${_bdf##*/}" none 206 fi 207 done 208 fi 209 210} 211 212function linux_unbind_driver() { 213 local bdf="$1" 214 local old_driver_name=${pci_bus_driver["$bdf"]:-no driver} 215 216 if [[ $old_driver_name == "no driver" ]]; then 217 pci_dev_echo "$bdf" "Not bound to any driver" 218 return 0 219 fi 220 221 if [[ -e /sys/bus/pci/drivers/$old_driver_name ]]; then 222 echo "$bdf" > "/sys/bus/pci/drivers/$old_driver_name/unbind" 223 echo "" > "/sys/bus/pci/devices/$bdf/driver_override" 224 fi 225 226 pci_dev_echo "$bdf" "$old_driver_name -> no driver" 227} 228 229function linux_hugetlbfs_mounts() { 230 mount | grep ' type hugetlbfs ' | awk '{ print $3 }' 231} 232 233function get_block_dev_from_bdf() { 234 local bdf=$1 235 local block blocks=() ctrl sub 236 237 for block in /sys/block/!(nvme*); do 238 if [[ $(readlink -f "$block/device") == *"/$bdf/"* ]]; then 239 blocks+=("${block##*/}") 240 fi 241 done 242 243 blocks+=($(get_block_dev_from_nvme "$bdf")) 244 245 printf '%s\n' "${blocks[@]}" 246} 247 248function get_block_dev_from_nvme() { 249 local bdf=$1 block ctrl sub 250 251 for ctrl in /sys/class/nvme/nvme*; do 252 [[ -e $ctrl/address && $(< "$ctrl/address") == "$bdf" ]] || continue 253 sub=$(< "$ctrl/subsysnqn") && break 254 done 255 256 [[ -n $sub ]] || return 0 257 258 for block in /sys/block/nvme*; do 259 [[ -e $block/hidden && $(< "$block/hidden") == 1 ]] && continue 260 [[ $(< "$block/device/subsysnqn") == "$sub" ]] && echo "${block##*/}" 261 done 262} 263 264function get_used_bdf_block_devs() { 265 local bdf=$1 266 local blocks block blockp dev mount holder 267 local used 268 269 hash lsblk &> /dev/null || return 1 270 blocks=($(get_block_dev_from_bdf "$bdf")) 271 272 for block in "${blocks[@]}"; do 273 # Check if the device is hold by some other, regardless if it's mounted 274 # or not. 275 for holder in "/sys/class/block/$block"*/holders/*; do 276 [[ -e $holder ]] || continue 277 blockp=${holder%/holders*} blockp=${blockp##*/} 278 if [[ -e $holder/slaves/$blockp ]]; then 279 used+=("holder@$blockp:${holder##*/}") 280 fi 281 done 282 while read -r dev mount; do 283 if [[ -e $mount ]]; then 284 used+=("mount@$block:$dev") 285 fi 286 done < <(lsblk -l -n -o NAME,MOUNTPOINT "/dev/$block") 287 if ((${#used[@]} == 0)); then 288 # Make sure we check if there's any valid data present on the target device 289 # regardless if it's being actively used or not. This is mainly done to make 290 # sure we don't miss more complex setups like ZFS pools, etc. 291 if block_in_use "$block" > /dev/null; then 292 used+=("data@$block") 293 fi 294 fi 295 done 296 297 if ((${#used[@]} > 0)); then 298 printf '%s\n' "${used[@]}" 299 fi 300} 301 302function collect_devices() { 303 local mode=$1 in_use 304 305 map_supported_devices "$DEV_TYPE" 306 307 for bdf in "${!all_devices_d[@]}"; do 308 in_use=0 309 if [[ $mode != status ]]; then 310 if ! pci_can_use "$bdf"; then 311 pci_dev_echo "$bdf" "Skipping denied controller at $bdf" 312 in_use=1 313 fi 314 fi 315 if [[ -n ${nvme_d["$bdf"]} || -n ${virtio_d["$bdf"]} ]]; then 316 if ! verify_bdf_block_devs "$bdf"; then 317 in_use=1 318 fi 319 fi 320 if [[ -n ${vmd_d["$bdf"]} ]]; then 321 if [[ $PCI_ALLOWED != *"$bdf"* ]]; then 322 pci_dev_echo "$bdf" "Skipping not allowed VMD controller at $bdf" 323 in_use=1 324 elif ((vmd_nvme_count["$bdf"] > 0)) && [[ $DRIVER_OVERRLDE != none && $mode == config ]]; then 325 cat <<- MESSAGE 326 Binding new driver to VMD device with NVMe SSDs attached to the kernel: 327 ${!vmd_nvme_d["$bdf"]} 328 The binding process may go faster if you first run this script with 329 DRIVER_OVERRIDE="none" to unbind only the NVMe SSDs, and then run 330 again to unbind the VMD devices. 331 MESSAGE 332 fi 333 fi 334 if [[ -n ${dsa_d["$bdf"]} ]] && [[ $PCI_ALLOWED != *"$bdf"* ]]; then 335 pci_dev_echo "$bdf" "Skipping not allowed DSA controller at $bdf" 336 in_use=1 337 fi 338 if [[ -n ${iaa_d["$bdf"]} ]] && [[ $PCI_ALLOWED != *"$bdf"* ]]; then 339 pci_dev_echo "$bdf" "Skipping not allowed IAA controller at $bdf" 340 in_use=1 341 fi 342 # Update in-use for each bdf. Default from the map_supported_devices() is 0 == "not used" 343 local -n type_ref=${all_devices_type_d["$bdf"]}_d 344 type_ref["$bdf"]=$in_use 345 all_devices_d["$bdf"]=$in_use 346 done 347 348 # Check if we got any nvmes attached to VMDs sharing the same iommu_group - if there are 349 # any skip them since they won't be usable by SPDK without moving the entire VMD ctrl 350 # away from the kernel first. That said, allow to touch the nvmes in case user requested 351 # all devices to be unbound from any driver or if dedicated override flag was set. 352 [[ -z $ALLOW_NVME_BEHIND_VMD && $DRIVER_OVERRIDE != none ]] || return 0 353 354 for bdf in "${!nvme_d[@]}"; do 355 is_nvme_iommu_shared_with_vmd "$bdf" || continue 356 nvme_d["$bdf"]=1 all_devices_d["$bdf"]=1 357 pci_dev_echo "$bdf" "Skipping nvme behind VMD (${nvme_vmd_d["$bdf"]})" 358 done 359 360 return 0 361} 362 363function collect_driver() { 364 local bdf=$1 365 local drivers driver 366 367 if [[ -e /sys/bus/pci/devices/$bdf/modalias ]] \ 368 && drivers=($(modprobe -R "$(< "/sys/bus/pci/devices/$bdf/modalias")")); then 369 # Pick first entry in case multiple aliases are bound to a driver. 370 driver=$(readlink -f "/sys/module/${drivers[0]}/drivers/pci:"*) 371 driver=${driver##*/} 372 else 373 [[ -n ${nvme_d["$bdf"]} ]] && driver=nvme 374 [[ -n ${ioat_d["$bdf"]} ]] && driver=ioatdma 375 [[ -n ${dsa_d["$bdf"]} ]] && driver=idxd 376 [[ -n ${iaa_d["$bdf"]} ]] && driver=idxd 377 [[ -n ${virtio_d["$bdf"]} ]] && driver=virtio-pci 378 [[ -n ${vmd_d["$bdf"]} ]] && driver=vmd 379 fi 2> /dev/null 380 echo "$driver" 381} 382 383function verify_bdf_block_devs() { 384 local bdf=$1 385 local blknames 386 blknames=($(get_used_bdf_block_devs "$bdf")) || return 1 387 388 if ((${#blknames[@]} > 0)); then 389 local IFS="," 390 pci_dev_echo "$bdf" "Active devices: ${blknames[*]}, so not binding PCI dev" 391 return 1 392 fi 393} 394 395function configure_linux_pci() { 396 local driver_path="" 397 driver_name="" 398 igb_uio_fallback="" 399 400 if [[ -r "$rootdir/dpdk/build-tmp/kernel/linux/igb_uio/igb_uio.ko" ]]; then 401 # igb_uio is a common driver to override with and it depends on uio. 402 modprobe uio || true 403 if ! check_for_driver igb_uio || insmod "$rootdir/dpdk/build-tmp/kernel/linux/igb_uio/igb_uio.ko"; then 404 igb_uio_fallback="$rootdir/dpdk/build-tmp/kernel/linux/igb_uio/igb_uio.ko" 405 fi 406 fi 407 408 if [[ "${DRIVER_OVERRIDE}" == "none" ]]; then 409 driver_name=none 410 elif [[ -n "${DRIVER_OVERRIDE}" ]]; then 411 driver_path="$DRIVER_OVERRIDE" 412 driver_name="${DRIVER_OVERRIDE##*/}" 413 # modprobe and the sysfs don't use the .ko suffix. 414 driver_name=${driver_name%.ko} 415 # path = name -> there is no path 416 if [[ "$driver_path" = "$driver_name" ]]; then 417 driver_path="" 418 fi 419 elif is_iommu_enabled; then 420 driver_name=vfio-pci 421 # Just in case, attempt to load VFIO_IOMMU_TYPE1 module into the kernel - this 422 # should be done automatically by modprobe since this particular module should 423 # be a part of vfio-pci dependencies, however, on some distros, it seems that 424 # it's not the case. See #1689. 425 if modinfo vfio_iommu_type1 > /dev/null; then 426 modprobe vfio_iommu_type1 427 fi 428 elif ! check_for_driver uio_pci_generic || modinfo uio_pci_generic > /dev/null 2>&1; then 429 driver_name=uio_pci_generic 430 elif [[ -e $igb_uio_fallback ]]; then 431 driver_path="$igb_uio_fallback" 432 driver_name="igb_uio" 433 echo "WARNING: uio_pci_generic not detected - using $driver_name" 434 else 435 echo "No valid drivers found [vfio-pci, uio_pci_generic, igb_uio]. Please enable one of the kernel modules." 436 return 1 437 fi 438 439 # modprobe assumes the directory of the module. If the user passes in a path, we should use insmod 440 if [[ $driver_name != "none" ]]; then 441 if [[ -n "$driver_path" ]]; then 442 insmod $driver_path || true 443 else 444 modprobe $driver_name 445 fi 446 fi 447 448 for bdf in "${!all_devices_d[@]}"; do 449 if ((all_devices_d["$bdf"] == 0)); then 450 if [[ -n ${nvme_d["$bdf"]} ]]; then 451 # Some nvme controllers may take significant amount of time while being 452 # unbound from the driver. Put that task into background to speed up the 453 # whole process. Currently this is done only for the devices bound to the 454 # nvme driver as other, i.e., ioatdma's, trigger a kernel BUG when being 455 # unbound in parallel. See https://bugzilla.kernel.org/show_bug.cgi?id=209041. 456 linux_bind_driver "$bdf" "$driver_name" & 457 else 458 linux_bind_driver "$bdf" "$driver_name" 459 fi 460 fi 461 done 462 wait 463 464 echo "1" > "/sys/bus/pci/rescan" 465} 466 467function cleanup_linux() { 468 local dirs_to_clean=() files_to_clean=() opened_files=() file_locks=() 469 local match_spdk="spdk_tgt|iscsi|vhost|nvmf|rocksdb|bdevio|bdevperf|vhost_fuzz|nvme_fuzz|accel_perf|bdev_svc" 470 471 dirs_to_clean=({/var/run,/tmp}/dpdk/spdk{,_pid}+([0-9])) 472 if [[ -d $XDG_RUNTIME_DIR ]]; then 473 dirs_to_clean+=("$XDG_RUNTIME_DIR/dpdk/spdk"{,_pid}+([0-9])) 474 fi 475 476 for dir in "${dirs_to_clean[@]}"; do 477 files_to_clean+=("$dir/"*) 478 done 479 file_locks+=(/var/tmp/spdk_pci_lock*) 480 file_locks+=(/var/tmp/spdk_cpu_lock*) 481 482 files_to_clean+=(/dev/shm/@(@($match_spdk)_trace|spdk_iscsi_conns)*) 483 files_to_clean+=("${file_locks[@]}") 484 485 # This may fail in case path that readlink attempts to resolve suddenly 486 # disappears (as it may happen with terminating processes). 487 opened_files+=($(readlink -f /proc/+([0-9])/fd/+([0-9]))) || true 488 489 if ((${#opened_files[@]} == 0)); then 490 echo "Can't get list of opened files!" 491 exit 1 492 fi 493 494 echo 'Cleaning' 495 for f in "${files_to_clean[@]}"; do 496 [[ -e $f ]] || continue 497 if [[ ${opened_files[*]} != *"$f"* ]]; then 498 echo "Removing: $f" 499 rm $f 500 else 501 echo "Still open: $f" 502 fi 503 done 504 505 for dir in "${dirs_to_clean[@]}"; do 506 [[ -d $dir ]] || continue 507 if [[ ${opened_files[*]} != *"$dir"* ]]; then 508 echo "Removing: $dir" 509 rmdir $dir 510 else 511 echo "Still open: $dir" 512 fi 513 done 514 echo "Clean" 515} 516 517check_hugepages_alloc() { 518 local hp_int=$1 519 local allocated_hugepages 520 521 allocated_hugepages=$(< "$hp_int") 522 523 if ((NRHUGE <= allocated_hugepages)) && [[ $SHRINK_HUGE != yes ]]; then 524 echo "INFO: Requested $NRHUGE hugepages but $allocated_hugepages already allocated ${2:+on node$2}" 525 return 0 526 fi 527 528 echo $((NRHUGE < 0 ? 0 : NRHUGE)) > "$hp_int" 529 530 allocated_hugepages=$(< "$hp_int") 531 if ((allocated_hugepages < NRHUGE)); then 532 cat <<- ERROR 533 534 ## ERROR: requested $NRHUGE hugepages but $allocated_hugepages could be allocated ${2:+on node$2}. 535 ## Memory might be heavily fragmented. Please try flushing the system cache, or reboot the machine. 536 ERROR 537 return 1 538 fi 539} 540 541clear_hugepages() { echo 0 > /proc/sys/vm/nr_hugepages; } 542 543configure_linux_hugepages() { 544 local node system_nodes 545 local nodes_to_use nodes_hp 546 547 if [[ $CLEAR_HUGE == yes ]]; then 548 clear_hugepages 549 fi 550 551 if [[ $HUGE_EVEN_ALLOC == yes ]]; then 552 clear_hugepages 553 check_hugepages_alloc /proc/sys/vm/nr_hugepages 554 return 0 555 fi 556 557 for node in /sys/devices/system/node/node*; do 558 [[ -e $node ]] || continue 559 nodes[${node##*node}]=$node/hugepages/hugepages-${HUGEPGSZ}kB/nr_hugepages 560 done 561 562 if ((${#nodes[@]} == 0)); then 563 # No NUMA support? Fallback to common interface 564 check_hugepages_alloc /proc/sys/vm/nr_hugepages 565 return 0 566 fi 567 568 IFS="," read -ra nodes_to_use <<< "$HUGENODE" 569 if ((${#nodes_to_use[@]} == 0)); then 570 nodes_to_use[0]=0 571 fi 572 573 # Align indexes with node ids 574 for node in "${!nodes_to_use[@]}"; do 575 if [[ ${nodes_to_use[node]} =~ ^nodes_hp\[[0-9]+\]= ]]; then 576 eval "${nodes_to_use[node]}" 577 elif [[ ${nodes_to_use[node]} =~ ^[0-9]+$ ]]; then 578 nodes_hp[nodes_to_use[node]]=$NRHUGE 579 fi 580 done 581 582 for node in "${!nodes_hp[@]}"; do 583 if [[ -z ${nodes[node]} ]]; then 584 echo "Node $node doesn't exist, ignoring" >&2 585 continue 586 fi 587 NRHUGE=${nodes_hp[node]:-$NRHUGE} check_hugepages_alloc "${nodes[node]}" "$node" 588 done 589} 590 591function configure_linux() { 592 configure_linux_pci 593 hugetlbfs_mounts=$(linux_hugetlbfs_mounts) 594 595 if [ -z "$hugetlbfs_mounts" ]; then 596 hugetlbfs_mounts=/mnt/huge 597 echo "Mounting hugetlbfs at $hugetlbfs_mounts" 598 mkdir -p "$hugetlbfs_mounts" 599 mount -t hugetlbfs nodev "$hugetlbfs_mounts" 600 fi 601 602 configure_linux_hugepages 603 604 if [ "$driver_name" = "vfio-pci" ]; then 605 if [ -n "$TARGET_USER" ]; then 606 for mount in $hugetlbfs_mounts; do 607 chown "$TARGET_USER" "$mount" 608 chmod g+w "$mount" 609 done 610 611 MEMLOCK_AMNT=$(su "$TARGET_USER" -c "ulimit -l") 612 if [[ $MEMLOCK_AMNT != "unlimited" ]]; then 613 MEMLOCK_MB=$((MEMLOCK_AMNT / 1024)) 614 cat <<- MEMLOCK 615 "$TARGET_USER" user memlock limit: $MEMLOCK_MB MB 616 617 This is the maximum amount of memory you will be 618 able to use with DPDK and VFIO if run as user "$TARGET_USER". 619 To change this, please adjust limits.conf memlock limit for user "$TARGET_USER". 620 MEMLOCK 621 if ((MEMLOCK_AMNT < 65536)); then 622 echo "" 623 echo "## WARNING: memlock limit is less than 64MB" 624 echo -n "## DPDK with VFIO may not be able to initialize " 625 echo "if run as user \"$TARGET_USER\"." 626 fi 627 fi 628 fi 629 fi 630 631 if [ $(uname -i) == "x86_64" ] && [ ! -e /dev/cpu/0/msr ]; then 632 # Some distros build msr as a module. Make sure it's loaded to ensure 633 # DPDK can easily figure out the TSC rate rather than relying on 100ms 634 # sleeps. 635 modprobe msr &> /dev/null || true 636 fi 637} 638 639function reset_linux_pci() { 640 # virtio 641 # TODO: check if virtio-pci is loaded first and just unbind if it is not loaded 642 # Requires some more investigation - for example, some kernels do not seem to have 643 # virtio-pci but just virtio_scsi instead. Also need to make sure we get the 644 # underscore vs. dash right in the virtio_scsi name. 645 modprobe virtio-pci || true 646 for bdf in "${!all_devices_d[@]}"; do 647 ((all_devices_d["$bdf"] == 0)) || continue 648 649 driver=$(collect_driver "$bdf") 650 if [[ -n $driver ]] && ! check_for_driver "$driver"; then 651 linux_bind_driver "$bdf" "$driver" 652 else 653 linux_unbind_driver "$bdf" 654 fi 655 done 656 657 echo "1" > "/sys/bus/pci/rescan" 658} 659 660function reset_linux() { 661 reset_linux_pci 662 for mount in $(linux_hugetlbfs_mounts); do 663 for hp in "$mount"/spdk*map_*; do 664 flock -n "$hp" true && rm -f "$hp" 665 done 666 done 667 rm -f /run/.spdk* 668} 669 670function status_linux() { 671 echo "Hugepages" >&2 672 printf "%-6s %10s %8s / %6s\n" "node" "hugesize" "free" "total" >&2 673 674 numa_nodes=0 675 for path in /sys/devices/system/node/node*/hugepages/hugepages-*/; do 676 numa_nodes=$((numa_nodes + 1)) 677 free_pages=$(cat $path/free_hugepages) 678 all_pages=$(cat $path/nr_hugepages) 679 680 [[ $path =~ (node[0-9]+)/hugepages/hugepages-([0-9]+kB) ]] 681 682 node=${BASH_REMATCH[1]} 683 huge_size=${BASH_REMATCH[2]} 684 685 printf "%-6s %10s %8s / %6s\n" $node $huge_size $free_pages $all_pages 686 done 687 688 # fall back to system-wide hugepages 689 if [ "$numa_nodes" = "0" ]; then 690 free_pages=$(grep HugePages_Free /proc/meminfo | awk '{ print $2 }') 691 all_pages=$(grep HugePages_Total /proc/meminfo | awk '{ print $2 }') 692 node="-" 693 huge_size="$HUGEPGSZ" 694 695 printf "%-6s %10s %8s / %6s\n" $node $huge_size $free_pages $all_pages 696 fi 697 698 printf '\n%-25s %-15s %-6s %-6s %-7s %-16s %-10s %s\n' \ 699 "Type" "BDF" "Vendor" "Device" "NUMA" "Driver" "Device" "Block devices" >&2 700 701 sorted_bdfs=($(printf '%s\n' "${!all_devices_d[@]}" | sort)) 702 703 for bdf in "${sorted_bdfs[@]}"; do 704 driver=${pci_bus_driver["$bdf"]} 705 if [ "$numa_nodes" = "0" ]; then 706 node="-" 707 else 708 node=$(cat /sys/bus/pci/devices/$bdf/numa_node) 709 if ((node == -1)); then 710 node=unknown 711 fi 712 fi 713 if [ "$driver" = "nvme" ] && [ -d /sys/bus/pci/devices/$bdf/nvme ]; then 714 name=$(ls /sys/bus/pci/devices/$bdf/nvme) 715 else 716 name="-" 717 fi 718 719 if [[ -n ${nvme_d["$bdf"]} || -n ${virtio_d["$bdf"]} ]]; then 720 blknames=($(get_block_dev_from_bdf "$bdf")) 721 else 722 blknames=("-") 723 fi 724 725 desc="" 726 desc=${desc:-${nvme_d["$bdf"]:+NVMe${nvme_vmd_d["$bdf"]:+@${nvme_vmd_d["$bdf"]}(VMD)}}} 727 desc=${desc:-${ioat_d["$bdf"]:+I/OAT}} 728 desc=${desc:-${dsa_d["$bdf"]:+DSA}} 729 desc=${desc:-${iaa_d["$bdf"]:+IAA}} 730 desc=${desc:-${virtio_d["$bdf"]:+virtio}} 731 desc=${desc:-${vmd_d["$bdf"]:+VMD}} 732 733 printf '%-25s %-15s %-6s %-6s %-7s %-16s %-10s %s\n' \ 734 "$desc" "$bdf" "${pci_ids_vendor["$bdf"]#0x}" "${pci_ids_device["$bdf"]#0x}" \ 735 "$node" "${driver:--}" "${name:-}" "${blknames[*]:--}" 736 done 737} 738 739function status_freebsd() { 740 local pci 741 742 status_print() ( 743 local type=$1 744 local dev driver 745 746 shift 747 748 for pci; do 749 printf '%-8s %-15s %-6s %-6s %-16s\n' \ 750 "$type" \ 751 "$pci" \ 752 "${pci_ids_vendor["$pci"]}" \ 753 "${pci_ids_device["$pci"]}" \ 754 "${pci_bus_driver["$pci"]}" 755 done | sort -k2,2 756 ) 757 758 local contigmem=present 759 local contigmem_buffer_size 760 local contigmem_num_buffers 761 762 if ! kldstat -q -m contigmem; then 763 contigmem="not present" 764 fi 765 if ! contigmem_buffer_size=$(kenv hw.contigmem.buffer_size 2> /dev/null); then 766 contigmem_buffer_size="not set" 767 fi 768 if ! contigmem_num_buffers=$(kenv hw.contigmem.num_buffers 2> /dev/null); then 769 contigmem_num_buffers="not set" 770 fi 771 772 cat <<- BSD_INFO 773 Contigmem ($contigmem) 774 Buffer Size: $contigmem_buffer_size 775 Num Buffers: $contigmem_num_buffers 776 777 BSD_INFO 778 779 printf '\n%-8s %-15s %-6s %-6s %-16s\n' \ 780 "Type" "BDF" "Vendor" "Device" "Driver" >&2 781 782 status_print "NVMe" "${!nvme_d[@]}" 783 status_print "I/OAT" "${!ioat_d[@]}" 784 status_print "DSA" "${!dsa_d[@]}" 785 status_print "IAA" "${!iaa_d[@]}" 786 status_print "VMD" "${!vmd_d[@]}" 787} 788 789function configure_freebsd_pci() { 790 local BDFS 791 792 BDFS+=("${!nvme_d[@]}") 793 BDFS+=("${!ioat_d[@]}") 794 BDFS+=("${!dsa_d[@]}") 795 BDFS+=("${!iaa_d[@]}") 796 BDFS+=("${!vmd_d[@]}") 797 798 # Drop the domain part from all the addresses 799 BDFS=("${BDFS[@]#*:}") 800 801 local IFS="," 802 kldunload nic_uio.ko || true 803 kenv hw.nic_uio.bdfs="${BDFS[*]}" 804 kldload nic_uio.ko 805} 806 807function configure_freebsd() { 808 if ! check_for_driver_freebsd; then 809 echo "DPDK drivers (contigmem and/or nic_uio) are missing, aborting" >&2 810 return 1 811 fi 812 configure_freebsd_pci 813 # If contigmem is already loaded but the HUGEMEM specified doesn't match the 814 # previous value, unload contigmem so that we can reload with the new value. 815 if kldstat -q -m contigmem; then 816 # contigmem may be loaded, but the kernel environment doesn't have to 817 # be necessarily set at this point. If it isn't, kenv will fail to 818 # pick up the hw. options. Handle it. 819 if ! contigmem_num_buffers=$(kenv hw.contigmem.num_buffers); then 820 contigmem_num_buffers=-1 821 fi 2> /dev/null 822 if ((contigmem_num_buffers != HUGEMEM / 256)); then 823 kldunload contigmem.ko 824 fi 825 fi 826 if ! kldstat -q -m contigmem; then 827 kenv hw.contigmem.num_buffers=$((HUGEMEM / 256)) 828 kenv hw.contigmem.buffer_size=$((256 * 1024 * 1024)) 829 kldload contigmem.ko 830 fi 831} 832 833function reset_freebsd() { 834 kldunload contigmem.ko || true 835 kldunload nic_uio.ko || true 836} 837 838function set_hp() { 839 if [[ -n $HUGEPGSZ && ! -e /sys/kernel/mm/hugepages/hugepages-${HUGEPGSZ}kB ]]; then 840 echo "${HUGEPGSZ}kB is not supported by the running kernel, ignoring" >&2 841 unset -v HUGEPGSZ 842 fi 843 844 HUGEPGSZ=${HUGEPGSZ:-$(grep Hugepagesize /proc/meminfo | cut -d : -f 2 | tr -dc '0-9')} 845 HUGEPGSZ_MB=$((HUGEPGSZ / 1024)) 846 NRHUGE=${NRHUGE:-$(((HUGEMEM + HUGEPGSZ_MB - 1) / HUGEPGSZ_MB))} 847} 848 849kmsg "spdk: $0 $* (start)" 850 851CMD=reset cache_pci_bus 852 853mode=$1 854 855if [ -z "$mode" ]; then 856 mode="config" 857fi 858 859: ${HUGEMEM:=2048} 860: ${PCI_ALLOWED:=""} 861: ${PCI_BLOCKED:=""} 862 863if [ -n "$NVME_ALLOWED" ]; then 864 PCI_ALLOWED="$PCI_ALLOWED $NVME_ALLOWED" 865fi 866 867if [ -n "$SKIP_PCI" ]; then 868 PCI_ALLOWED="none" 869fi 870 871if [ -z "$TARGET_USER" ]; then 872 TARGET_USER="$SUDO_USER" 873 if [ -z "$TARGET_USER" ]; then 874 TARGET_USER=$(logname 2> /dev/null) || true 875 fi 876fi 877 878collect_devices "$mode" 879 880if [[ $os == Linux ]]; then 881 set_hp 882fi 883 884if [[ $mode == interactive ]]; then 885 source "$rootdir/scripts/common/setup/interactive.sh" 886 main_menu "$2" || exit 0 887fi 888 889if [[ $mode == reset && $PCI_BLOCK_SYNC_ON_RESET == yes ]]; then 890 # Note that this will wait only for the first block device attached to 891 # a given storage controller. For nvme this may miss some of the devs 892 # in case multiple namespaces are being in place. 893 # FIXME: Wait for nvme controller(s) to be in live state and determine 894 # number of configured namespaces, build list of potential block devs 895 # and pass them to sync_dev_uevents. Is it worth the effort? 896 bdfs_to_wait_for=() 897 for bdf in "${!all_devices_d[@]}"; do 898 ((all_devices_d["$bdf"] == 0)) || continue 899 if [[ -n ${nvme_d["$bdf"]} || -n ${virtio_d["$bdf"]} ]]; then 900 [[ $(collect_driver "$bdf") != "${pci_bus_driver["$bdf"]}" ]] || continue 901 bdfs_to_wait_for+=("$bdf") 902 fi 903 done 904 if ((${#bdfs_to_wait_for[@]} > 0)); then 905 echo "Waiting for block devices as requested" 906 export UEVENT_TIMEOUT=5 DEVPATH_LOOKUP=yes DEVPATH_SUBSYSTEM=pci 907 "$rootdir/scripts/sync_dev_uevents.sh" \ 908 block/disk \ 909 "${bdfs_to_wait_for[@]}" & 910 sync_pid=$! 911 fi 912fi 913 914if [[ $os == Linux ]]; then 915 if [ "$mode" == "config" ]; then 916 configure_linux 917 elif [ "$mode" == "cleanup" ]; then 918 cleanup_linux 919 clear_hugepages 920 elif [ "$mode" == "reset" ]; then 921 reset_linux 922 elif [ "$mode" == "status" ]; then 923 status_linux 924 elif [ "$mode" == "help" ]; then 925 usage $0 926 else 927 usage $0 "Invalid argument '$mode'" 928 fi 929else 930 if [ "$mode" == "config" ]; then 931 configure_freebsd 932 elif [ "$mode" == "reset" ]; then 933 reset_freebsd 934 elif [ "$mode" == "cleanup" ]; then 935 echo "setup.sh cleanup function not yet supported on $os" 936 elif [ "$mode" == "status" ]; then 937 status_freebsd 938 elif [ "$mode" == "help" ]; then 939 usage $0 940 else 941 usage $0 "Invalid argument '$mode'" 942 fi 943fi 944 945if [[ -e /proc/$sync_pid/status ]]; then 946 wait "$sync_pid" 947fi 948 949kmsg "spdk: $0 $* (done)" 950