#!/usr/bin/env bash
# SPDX-License-Identifier: BSD-3-Clause
# Copyright (C) 2016 Intel Corporation
# All rights reserved.
#
set -e
shopt -s nullglob extglob

os=$(uname -s)

if [[ $os != Linux && $os != FreeBSD ]]; then
	echo "Not supported platform ($os), aborting"
	exit 1
fi

rootdir=$(readlink -f "$(dirname "$0")")/..
source "$rootdir/scripts/common.sh"

# Print help text. $1 - script name, $2 - optional error message.
function usage() {
	if [[ $os == Linux ]]; then
		options="[config|reset|status|cleanup|interactive|help]"
	else
		options="[config|reset|interactive|help]"
	fi

	[[ -n $2 ]] && (
		echo "$2"
		echo ""
	)
	echo "Helper script for allocating hugepages and binding NVMe, I/OAT, VMD and Virtio devices"
	echo "to a generic VFIO kernel driver. If VFIO is not available on the system, this script"
	echo "will fall back to UIO. NVMe and Virtio devices with active mountpoints will be ignored."
	echo "All hugepage operations use default hugepage size on the system (hugepagesz)."
	echo "Usage: $(basename "$1") $options"
	echo
	echo "$options - as following:"
	echo "config            Default mode. Allocate hugepages and bind PCI devices."
	if [[ $os == Linux ]]; then
		echo "cleanup           Remove any orphaned files that can be left in the system after SPDK application exit"
	fi
	echo "reset             Rebind PCI devices back to their original drivers."
	echo "                  Also cleanup any leftover spdk files/resources."
	echo "                  Hugepage memory size will remain unchanged."
	if [[ $os == Linux ]]; then
		echo "status            Print status of all SPDK-compatible devices on the system."
	fi
	echo "interactive       Executes script in interactive mode."
	echo "help              Print this help message."
	echo
	echo "The following environment variables can be specified."
	echo "HUGEMEM           Size of hugepage memory to allocate (in MB). 2048 by default."
	echo "                  For NUMA systems, the hugepages will be distributed on node0 by"
	echo "                  default."
	echo "HUGE_EVEN_ALLOC   If set to 'yes', hugepages will be evenly distributed across all"
	echo "                  system's NUMA nodes (effectively ignoring anything set in HUGENODE)."
	echo "                  Uses kernel's default for hugepages size."
	echo "NRHUGE            Number of hugepages to allocate. This variable overwrites HUGEMEM."
	echo "HUGENODE          Specific NUMA node to allocate hugepages on. Multiple nodes can be"
	echo "                  separated with commas. By default, NRHUGE will be applied on each node."
	echo "                  Hugepages can be defined per node with e.g.:"
	echo "                  HUGENODE='nodes_hp[0]=2048,nodes_hp[1]=512,2' - this will allocate"
	echo "                  2048 pages for node0, 512 for node1 and default NRHUGE for node2."
	echo "HUGEPGSZ          Size of the hugepages to use in kB. If not set, kernel's default"
	echo "                  setting is used."
	echo "SHRINK_HUGE       If set to 'yes', hugepages allocation won't be skipped in case"
	echo "                  number of requested hugepages is lower from what's already"
	echo "                  allocated. Doesn't apply when HUGE_EVEN_ALLOC is in use."
	echo "CLEAR_HUGE        If set to 'yes', the attempt to remove hugepages from all nodes will"
	echo "                  be made prior to allocation."
	echo "PCI_ALLOWED"
	echo "PCI_BLOCKED       Whitespace separated list of PCI devices (NVMe, I/OAT, VMD, Virtio)."
	echo "                  Each device must be specified as a full PCI address."
	echo "                  E.g. PCI_ALLOWED=\"0000:01:00.0 0000:02:00.0\""
	echo "                  To block all PCI devices: PCI_ALLOWED=\"none\""
	echo "                  To allow all PCI devices except 0000:01:00.0: PCI_BLOCKED=\"0000:01:00.0\""
	echo "                  To allow only PCI device 0000:01:00.0: PCI_ALLOWED=\"0000:01:00.0\""
	echo "                  If PCI_ALLOWED and PCI_BLOCKED are empty or unset, all PCI devices"
	echo "                  will be bound."
	echo "                  Each device in PCI_BLOCKED will be ignored (driver won't be changed)."
	echo "                  PCI_BLOCKED has precedence over PCI_ALLOWED."
	echo "TARGET_USER       User that will own hugepage mountpoint directory and vfio groups."
	echo "                  By default the current user will be used."
	echo "DRIVER_OVERRIDE   Disable automatic vfio-pci/uio_pci_generic selection and forcefully"
	echo "                  bind devices to the given driver."
	echo "                  E.g. DRIVER_OVERRIDE=uio_pci_generic or DRIVER_OVERRIDE=/home/public/dpdk/build/kmod/igb_uio.ko"
	echo "PCI_BLOCK_SYNC_ON_RESET"
	echo "                  If set in the environment, the attempt to wait for block devices associated"
	echo "                  with given PCI device will be made upon reset"
	echo "UNBIND_ENTIRE_IOMMU_GROUP"
	echo "                  If set, all devices from nvme's iommu group will be unbound from their drivers."
	echo "                  Use with caution."
	echo "DEV_TYPE"
	echo "                  Perform action only against selected type of devices. Supported:"
	echo "                  IOAT|DSA|IAA|VIRTIO|VMD|NVME."
	echo "                  Default is to select all types."
	exit 0
}

# In monolithic kernels the lsmod won't work. So
# back that with a /sys/modules. We also check
# /sys/bus/pci/drivers/ as neither lsmod nor /sys/modules might
# contain needed info (like in Fedora-like OS).
# Returns 0 when driver $1 is not available, non-zero when it is.
function check_for_driver() {
	if [[ -z $1 ]]; then
		return 0
	fi

	if lsmod | grep -q "${1//-/_}"; then
		return 1
	fi

	if [[ -d /sys/module/${1} || -d \
		/sys/module/${1//-/_} || -d \
		/sys/bus/pci/drivers/${1} || -d \
		/sys/bus/pci/drivers/${1//-/_} ]]; then
		return 2
	fi
	return 0
}

function check_for_driver_freebsd() {
	# Check if dpdk drivers (nic_uio, contigmem) are in the kernel's module path.
	local search_paths path driver
	IFS=";" read -ra search_paths < <(kldconfig -rU)

	for driver in contigmem.ko nic_uio.ko; do
		for path in "${search_paths[@]}"; do
			[[ -f $path/$driver ]] && continue 2
		done
		return 1
	done
	return 0
}

# Echo $2.. prefixed with the device's BDF and its vendor:device IDs.
function pci_dev_echo() {
	local bdf="$1"
	shift
	echo "$bdf (${pci_ids_vendor["$bdf"]#0x} ${pci_ids_device["$bdf"]#0x}): $*"
}

# Unbind device $1 from its current driver (if any) and bind it to driver $2.
# Driver "none" leaves the device unbound.
function probe_driver() {
	local bdf=$1
	local driver_name=$2
	old_driver_name=${pci_bus_driver["$bdf"]:-no driver}

	if [[ $driver_name == "$old_driver_name" ]]; then
		pci_dev_echo "$bdf" "Already using the $old_driver_name driver"
		return 0
	fi

	if [[ $old_driver_name != "no driver" ]]; then
		echo "$bdf" > "/sys/bus/pci/devices/$bdf/driver/unbind"
	fi

	pci_dev_echo "$bdf" "$old_driver_name -> $driver_name"

	if [[ $driver_name == "none" ]]; then
		return 0
	fi

	# driver_override pins the device to the requested driver for the
	# duration of the probe; it's cleared afterwards so future probes
	# are not affected.
	local probe_attempts=0
	echo "$driver_name" > "/sys/bus/pci/devices/$bdf/driver_override"
	while ! echo "$bdf" > "/sys/bus/pci/drivers_probe" && ((probe_attempts++ < 10)); do
		pci_dev_echo "$bdf" "failed to bind to $driver_name, retrying ($probe_attempts)"
		sleep 0.5
	done 2> /dev/null

	echo "" > "/sys/bus/pci/devices/$bdf/driver_override"

	if [[ ! -e /sys/bus/pci/drivers/$driver_name/$bdf ]]; then
		pci_dev_echo "$bdf" "failed to bind to $driver_name, aborting"
		return 1
	fi
}

# Bind device $1 to driver $2 and fix up vfio group ownership / IOMMU group
# consistency afterwards.
function linux_bind_driver() {
	local bdf="$1"
	local driver_name="$2"

	probe_driver "$bdf" "$driver_name"

	local iommu_group=${pci_iommu_groups["$bdf"]}
	if [ -e "/dev/vfio/$iommu_group" ]; then
		if [ -n "$TARGET_USER" ]; then
			chown "$TARGET_USER" "/dev/vfio/$iommu_group"
		fi
	fi

	local iommug=("${!iommu_groups[iommu_group]}")
	local _bdf _driver
	if ((${#iommug[@]} > 1)) && [[ $driver_name == vfio* ]]; then
		pci_dev_echo "$bdf" "WARNING: detected multiple devices (${#iommug[@]}) under the same IOMMU group!"
		for _bdf in "${iommug[@]}"; do
			[[ $_bdf == "$bdf" ]] && continue
			_driver=$(readlink -f "/sys/bus/pci/devices/$_bdf/driver") && _driver=${_driver##*/}
			if [[ $_driver == "$driver_name" ]]; then
				continue
			fi
			# See what DPDK considers to be a "viable" iommu group: dpdk/lib/eal/linux/eal_vfio.c -> rte_vfio_setup_device()
			pci_dev_echo "$bdf" "WARNING: ${_bdf##*/} not bound to $driver_name (${_driver:-no driver})"
			pci_dev_echo "$bdf" "WARNING: All devices in the IOMMU group must be bound to the same driver or unbound"
			if [[ $UNBIND_ENTIRE_IOMMU_GROUP == yes ]]; then
				pci_dev_echo "$bdf" "WARNING: Attempting to unbind ${_bdf##*/}"
				pci_bus_driver["${_bdf##*/}"]=$_driver
				probe_driver "${_bdf##*/}" none
			fi
		done
	fi

}

# Unbind device $1 from its current driver, leaving it driverless.
function linux_unbind_driver() {
	local bdf="$1"
	local old_driver_name=${pci_bus_driver["$bdf"]:-no driver}

	if [[ $old_driver_name == "no driver" ]]; then
		pci_dev_echo "$bdf" "Not bound to any driver"
		return 0
	fi

	if [[ -e /sys/bus/pci/drivers/$old_driver_name ]]; then
		echo "$bdf" > "/sys/bus/pci/drivers/$old_driver_name/unbind"
		echo "" > "/sys/bus/pci/devices/$bdf/driver_override"
	fi

	pci_dev_echo "$bdf" "$old_driver_name -> no driver"
}

# List mountpoints of all currently mounted hugetlbfs filesystems.
function linux_hugetlbfs_mounts() {
	mount | grep ' type hugetlbfs ' | awk '{ print $3 }'
}

# Print names of all block devices backed by PCI device $1.
function get_block_dev_from_bdf() {
	local bdf=$1
	local block blocks=()

	for block in /sys/block/!(nvme*); do
		if [[ $(readlink -f "$block/device") == *"/$bdf/"* ]]; then
			blocks+=("${block##*/}")
		fi
	done

	blocks+=($(get_block_dev_from_nvme "$bdf"))

	printf '%s\n' "${blocks[@]}"
}

# Print names of nvme block devices that belong to the same NVMe subsystem
# as the controller at PCI address $1 (covers multipath setups).
function get_block_dev_from_nvme() {
	local bdf=$1 block ctrl sub

	for ctrl in /sys/class/nvme/nvme*; do
		[[ -e $ctrl/address && $(< "$ctrl/address") == "$bdf" ]] || continue
		sub=$(< "$ctrl/subsysnqn") && break
	done

	[[ -n $sub ]] || return 0

	for block in /sys/block/nvme*; do
		[[ -e $block/hidden && $(< "$block/hidden") == 1 ]] && continue
		[[ $(< "$block/device/subsysnqn") == "$sub" ]] && echo "${block##*/}"
	done
}

# Print block devices of PCI device $1 that appear to be in use (held by
# another device, mounted, or carrying valid data). No output means unused.
function get_used_bdf_block_devs() {
	local bdf=$1
	local blocks block blockp dev mount holder
	local used=()

	hash lsblk &> /dev/null || return 1
	blocks=($(get_block_dev_from_bdf "$bdf"))

	for block in "${blocks[@]}"; do
		# Check if the device is hold by some other, regardless if it's mounted
		# or not.
		for holder in "/sys/class/block/$block"*/holders/*; do
			[[ -e $holder ]] || continue
			blockp=${holder%/holders*} blockp=${blockp##*/}
			if [[ -e $holder/slaves/$blockp ]]; then
				used+=("holder@$blockp:${holder##*/}")
			fi
		done
		while read -r dev mount; do
			if [[ -e $mount ]]; then
				used+=("mount@$block:$dev")
			fi
		done < <(lsblk -l -n -o NAME,MOUNTPOINT "/dev/$block")
		if ((${#used[@]} == 0)); then
			# Make sure we check if there's any valid data present on the target device
			# regardless if it's being actively used or not. This is mainly done to make
			# sure we don't miss more complex setups like ZFS pools, etc.
			if block_in_use "$block" > /dev/null; then
				used+=("data@$block")
			fi
		fi
	done

	if ((${#used[@]} > 0)); then
		printf '%s\n' "${used[@]}"
	fi
}

# Populate the *_d device maps and mark each bdf as usable (0) or in-use (1)
# for the given mode.
function collect_devices() {
	local mode=$1 in_use

	map_supported_devices "$DEV_TYPE"

	for bdf in "${!all_devices_d[@]}"; do
		in_use=0
		if [[ $mode != status ]]; then
			if ! pci_can_use "$bdf"; then
				pci_dev_echo "$bdf" "Skipping denied controller at $bdf"
				in_use=1
			fi
		fi
		if [[ -n ${nvme_d["$bdf"]} || -n ${virtio_d["$bdf"]} ]]; then
			if ! verify_bdf_block_devs "$bdf"; then
				in_use=1
			fi
		fi
		if [[ -n ${vmd_d["$bdf"]} ]]; then
			if [[ $PCI_ALLOWED != *"$bdf"* ]]; then
				pci_dev_echo "$bdf" "Skipping not allowed VMD controller at $bdf"
				in_use=1
			elif ((vmd_nvme_count["$bdf"] > 0)) && [[ $DRIVER_OVERRIDE != none && $mode == config ]]; then
				cat <<- MESSAGE
					Binding new driver to VMD device with NVMe SSDs attached to the kernel:
					${!vmd_nvme_d["$bdf"]}
					The binding process may go faster if you first run this script with
					DRIVER_OVERRIDE="none" to unbind only the NVMe SSDs, and then run
					again to unbind the VMD devices.
				MESSAGE
			fi
		fi
		# Update in-use for each bdf. Default from the map_supported_devices() is 0 == "not used"
		local -n type_ref=${all_devices_type_d["$bdf"]}_d
		type_ref["$bdf"]=$in_use
		all_devices_d["$bdf"]=$in_use
	done

	# Check if we got any nvmes attached to VMDs sharing the same iommu_group - if there are
	# any skip them since they won't be usable by SPDK without moving the entire VMD ctrl
	# away from the kernel first. That said, allow to touch the nvmes in case user requested
	# all devices to be unbound from any driver or if dedicated override flag was set.
	[[ -z $ALLOW_NVME_BEHIND_VMD && $DRIVER_OVERRIDE != none ]] || return 0

	for bdf in "${!nvme_d[@]}"; do
		is_nvme_iommu_shared_with_vmd "$bdf" || continue
		nvme_d["$bdf"]=1 all_devices_d["$bdf"]=1
		pci_dev_echo "$bdf" "Skipping nvme behind VMD (${nvme_vmd_d["$bdf"]})"
	done

	return 0
}

# Determine the kernel driver device $1 should be rebound to on reset.
function collect_driver() {
	local bdf=$1
	local drivers driver

	if [[ -e /sys/bus/pci/devices/$bdf/modalias ]] \
		&& drivers=($(modprobe -R "$(< "/sys/bus/pci/devices/$bdf/modalias")")); then
		# Pick first entry in case multiple aliases are bound to a driver.
		driver=$(readlink -f "/sys/module/${drivers[0]}/drivers/pci:"*)
		driver=${driver##*/}
	else
		[[ -n ${nvme_d["$bdf"]} ]] && driver=nvme
		[[ -n ${ioat_d["$bdf"]} ]] && driver=ioatdma
		[[ -n ${dsa_d["$bdf"]} ]] && driver=idxd
		[[ -n ${iaa_d["$bdf"]} ]] && driver=idxd
		[[ -n ${virtio_d["$bdf"]} ]] && driver=virtio-pci
		[[ -n ${vmd_d["$bdf"]} ]] && driver=vmd
	fi 2> /dev/null
	echo "$driver"
}

# Return 0 when no block device of PCI device $1 is actively used.
function verify_bdf_block_devs() {
	local bdf=$1
	local blknames
	blknames=($(get_used_bdf_block_devs "$bdf")) || return 1

	if ((${#blknames[@]} > 0)); then
		local IFS=","
		pci_dev_echo "$bdf" "Active devices: ${blknames[*]}, so not binding PCI dev"
		return 1
	fi
}

# Select a userspace driver (honoring DRIVER_OVERRIDE) and bind all usable
# devices to it. Sets the global driver_name.
function configure_linux_pci() {
	local driver_path=""
	driver_name=""
	igb_uio_fallback=""

	if [[ -r "$rootdir/dpdk/build-tmp/kernel/linux/igb_uio/igb_uio.ko" ]]; then
		# igb_uio is a common driver to override with and it depends on uio.
		modprobe uio || true
		if ! check_for_driver igb_uio || insmod "$rootdir/dpdk/build-tmp/kernel/linux/igb_uio/igb_uio.ko"; then
			igb_uio_fallback="$rootdir/dpdk/build-tmp/kernel/linux/igb_uio/igb_uio.ko"
		fi
	fi

	if [[ "${DRIVER_OVERRIDE}" == "none" ]]; then
		driver_name=none
	elif [[ -n "${DRIVER_OVERRIDE}" ]]; then
		driver_path="$DRIVER_OVERRIDE"
		driver_name="${DRIVER_OVERRIDE##*/}"
		# modprobe and the sysfs don't use the .ko suffix.
		driver_name=${driver_name%.ko}
		# path = name -> there is no path
		if [[ "$driver_path" = "$driver_name" ]]; then
			driver_path=""
		fi
	elif is_iommu_enabled; then
		driver_name=vfio-pci
		# Just in case, attempt to load VFIO_IOMMU_TYPE1 module into the kernel - this
		# should be done automatically by modprobe since this particular module should
		# be a part of vfio-pci dependencies, however, on some distros, it seems that
		# it's not the case. See #1689.
		if modinfo vfio_iommu_type1 > /dev/null; then
			modprobe vfio_iommu_type1
		fi
	elif ! check_for_driver uio_pci_generic || modinfo uio_pci_generic > /dev/null 2>&1; then
		driver_name=uio_pci_generic
	elif [[ -e $igb_uio_fallback ]]; then
		driver_path="$igb_uio_fallback"
		driver_name="igb_uio"
		echo "WARNING: uio_pci_generic not detected - using $driver_name"
	else
		echo "No valid drivers found [vfio-pci, uio_pci_generic, igb_uio]. Please enable one of the kernel modules."
		return 1
	fi

	# modprobe assumes the directory of the module. If the user passes in a path, we should use insmod
	if [[ $driver_name != "none" ]]; then
		if [[ -n "$driver_path" ]]; then
			insmod "$driver_path" || true
		else
			modprobe "$driver_name"
		fi
	fi

	for bdf in "${!all_devices_d[@]}"; do
		if ((all_devices_d["$bdf"] == 0)); then
			if [[ -n ${nvme_d["$bdf"]} ]]; then
				# Some nvme controllers may take significant amount of time while being
				# unbound from the driver. Put that task into background to speed up the
				# whole process. Currently this is done only for the devices bound to the
				# nvme driver as other, i.e., ioatdma's, trigger a kernel BUG when being
				# unbound in parallel. See https://bugzilla.kernel.org/show_bug.cgi?id=209041.
				linux_bind_driver "$bdf" "$driver_name" &
			else
				linux_bind_driver "$bdf" "$driver_name"
			fi
		fi
	done
	wait

	echo "1" > "/sys/bus/pci/rescan"
}

# Remove files/dirs left behind by a terminated SPDK application, skipping
# any that are still held open by a running process.
function cleanup_linux() {
	local dirs_to_clean=() files_to_clean=() opened_files=() file_locks=()
	local match_spdk="spdk_tgt|iscsi|vhost|nvmf|rocksdb|bdevio|bdevperf|vhost_fuzz|nvme_fuzz|accel_perf|bdev_svc"

	dirs_to_clean=({/var/run,/tmp}/dpdk/spdk{,_pid}+([0-9]))
	if [[ -d $XDG_RUNTIME_DIR ]]; then
		dirs_to_clean+=("$XDG_RUNTIME_DIR/dpdk/spdk"{,_pid}+([0-9]))
	fi

	for dir in "${dirs_to_clean[@]}"; do
		files_to_clean+=("$dir/"*)
	done
	file_locks+=(/var/tmp/spdk_pci_lock*)
	file_locks+=(/var/tmp/spdk_cpu_lock*)

	files_to_clean+=(/dev/shm/@(@($match_spdk)_trace|spdk_iscsi_conns)*)
	files_to_clean+=("${file_locks[@]}")

	# This may fail in case path that readlink attempts to resolve suddenly
	# disappears (as it may happen with terminating processes).
	opened_files+=($(readlink -f /proc/+([0-9])/fd/+([0-9]))) || true

	if ((${#opened_files[@]} == 0)); then
		echo "Can't get list of opened files!"
		exit 1
	fi

	echo 'Cleaning'
	for f in "${files_to_clean[@]}"; do
		[[ -e $f ]] || continue
		if [[ ${opened_files[*]} != *"$f"* ]]; then
			echo "Removing: $f"
			rm "$f"
		else
			echo "Still open: $f"
		fi
	done

	for dir in "${dirs_to_clean[@]}"; do
		[[ -d $dir ]] || continue
		if [[ ${opened_files[*]} != *"$dir"* ]]; then
			echo "Removing: $dir"
			rmdir "$dir"
		else
			echo "Still open: $dir"
		fi
	done
	echo "Clean"
}

# Request NRHUGE hugepages via sysfs/procfs interface $1 ($2 - optional node
# id, used only in messages) and verify the allocation succeeded.
check_hugepages_alloc() {
	local hp_int=$1
	local allocated_hugepages

	allocated_hugepages=$(< "$hp_int")

	if ((NRHUGE <= allocated_hugepages)) && [[ $SHRINK_HUGE != yes ]]; then
		echo "INFO: Requested $NRHUGE hugepages but $allocated_hugepages already allocated ${2:+on node$2}"
		return 0
	fi

	echo $((NRHUGE < 0 ? 0 : NRHUGE)) > "$hp_int"

	allocated_hugepages=$(< "$hp_int")
	if ((allocated_hugepages < NRHUGE)); then
		cat <<- ERROR

			## ERROR: requested $NRHUGE hugepages but $allocated_hugepages could be allocated ${2:+on node$2}.
			## Memory might be heavily fragmented. Please try flushing the system cache, or reboot the machine.
		ERROR
		return 1
	fi
}

clear_hugepages() { echo 0 > /proc/sys/vm/nr_hugepages; }

# Allocate hugepages either evenly (HUGE_EVEN_ALLOC), per requested NUMA
# nodes (HUGENODE), or on node0 by default.
configure_linux_hugepages() {
	local node
	local nodes_to_use nodes_hp

	if [[ $CLEAR_HUGE == yes ]]; then
		clear_hugepages
	fi

	if [[ $HUGE_EVEN_ALLOC == yes ]]; then
		clear_hugepages
		check_hugepages_alloc /proc/sys/vm/nr_hugepages
		return 0
	fi

	for node in /sys/devices/system/node/node*; do
		[[ -e $node ]] || continue
		nodes[${node##*node}]=$node/hugepages/hugepages-${HUGEPGSZ}kB/nr_hugepages
	done

	if ((${#nodes[@]} == 0)); then
		# No NUMA support? Fallback to common interface
		check_hugepages_alloc /proc/sys/vm/nr_hugepages
		return 0
	fi

	IFS="," read -ra nodes_to_use <<< "$HUGENODE"
	if ((${#nodes_to_use[@]} == 0)); then
		nodes_to_use[0]=0
	fi

	# Align indexes with node ids
	for node in "${!nodes_to_use[@]}"; do
		if [[ ${nodes_to_use[node]} =~ ^nodes_hp\[[0-9]+\]= ]]; then
			eval "${nodes_to_use[node]}"
		elif [[ ${nodes_to_use[node]} =~ ^[0-9]+$ ]]; then
			nodes_hp[nodes_to_use[node]]=$NRHUGE
		fi
	done

	for node in "${!nodes_hp[@]}"; do
		if [[ -z ${nodes[node]} ]]; then
			echo "Node $node doesn't exist, ignoring" >&2
			continue
		fi
		NRHUGE=${nodes_hp[node]:-$NRHUGE} check_hugepages_alloc "${nodes[node]}" "$node"
	done
}

# Full Linux "config" mode: bind devices, mount hugetlbfs, allocate
# hugepages and adjust permissions for TARGET_USER.
function configure_linux() {
	configure_linux_pci
	hugetlbfs_mounts=$(linux_hugetlbfs_mounts)

	if [ -z "$hugetlbfs_mounts" ]; then
		hugetlbfs_mounts=/mnt/huge
		echo "Mounting hugetlbfs at $hugetlbfs_mounts"
		mkdir -p "$hugetlbfs_mounts"
		mount -t hugetlbfs nodev "$hugetlbfs_mounts"
	fi

	configure_linux_hugepages

	if [ "$driver_name" = "vfio-pci" ]; then
		if [ -n "$TARGET_USER" ]; then
			for mount in $hugetlbfs_mounts; do
				chown "$TARGET_USER" "$mount"
				chmod g+w "$mount"
			done

			MEMLOCK_AMNT=$(su "$TARGET_USER" -c "ulimit -l")
			if [[ $MEMLOCK_AMNT != "unlimited" ]]; then
				MEMLOCK_MB=$((MEMLOCK_AMNT / 1024))
				cat <<- MEMLOCK
					"$TARGET_USER" user memlock limit: $MEMLOCK_MB MB

					This is the maximum amount of memory you will be
					able to use with DPDK and VFIO if run as user "$TARGET_USER".
					To change this, please adjust limits.conf memlock limit for user "$TARGET_USER".
				MEMLOCK
				if ((MEMLOCK_AMNT < 65536)); then
					echo ""
					echo "## WARNING: memlock limit is less than 64MB"
					echo -n "## DPDK with VFIO may not be able to initialize "
					echo "if run as user \"$TARGET_USER\"."
				fi
			fi
		fi
	fi

	if [[ $(uname -i) == "x86_64" ]] && [ ! -e /dev/cpu/0/msr ]; then
		# Some distros build msr as a module. Make sure it's loaded to ensure
		# DPDK can easily figure out the TSC rate rather than relying on 100ms
		# sleeps.
		modprobe msr &> /dev/null || true
	fi
}

# Rebind all usable devices back to their original kernel drivers.
function reset_linux_pci() {
	# virtio
	# TODO: check if virtio-pci is loaded first and just unbind if it is not loaded
	# Requires some more investigation - for example, some kernels do not seem to have
	# virtio-pci but just virtio_scsi instead. Also need to make sure we get the
	# underscore vs. dash right in the virtio_scsi name.
	modprobe virtio-pci || true
	for bdf in "${!all_devices_d[@]}"; do
		((all_devices_d["$bdf"] == 0)) || continue

		driver=$(collect_driver "$bdf")
		if [[ -n $driver ]] && ! check_for_driver "$driver"; then
			linux_bind_driver "$bdf" "$driver"
		else
			linux_unbind_driver "$bdf"
		fi
	done

	echo "1" > "/sys/bus/pci/rescan"
}

# Linux "reset" mode: rebind devices and drop leftover SPDK hugepage files.
function reset_linux() {
	reset_linux_pci
	for mount in $(linux_hugetlbfs_mounts); do
		for hp in "$mount"/spdk*map_*; do
			flock -n "$hp" true && rm -f "$hp"
		done
	done
	rm -f /run/.spdk*
}

# Print hugepage usage and a table of all SPDK-compatible devices.
function status_linux() {
	echo "Hugepages" >&2
	printf "%-6s %10s %8s / %6s\n" "node" "hugesize" "free" "total" >&2

	numa_nodes=0
	for path in /sys/devices/system/node/node*/hugepages/hugepages-*/; do
		numa_nodes=$((numa_nodes + 1))
		free_pages=$(cat "$path/free_hugepages")
		all_pages=$(cat "$path/nr_hugepages")

		[[ $path =~ (node[0-9]+)/hugepages/hugepages-([0-9]+kB) ]]

		node=${BASH_REMATCH[1]}
		huge_size=${BASH_REMATCH[2]}

		printf "%-6s %10s %8s / %6s\n" "$node" "$huge_size" "$free_pages" "$all_pages"
	done

	# fall back to system-wide hugepages
	if [ "$numa_nodes" = "0" ]; then
		free_pages=$(grep HugePages_Free /proc/meminfo | awk '{ print $2 }')
		all_pages=$(grep HugePages_Total /proc/meminfo | awk '{ print $2 }')
		node="-"
		huge_size="$HUGEPGSZ"

		printf "%-6s %10s %8s / %6s\n" "$node" "$huge_size" "$free_pages" "$all_pages"
	fi

	printf '\n%-25s %-15s %-6s %-6s %-7s %-16s %-10s %s\n' \
		"Type" "BDF" "Vendor" "Device" "NUMA" "Driver" "Device" "Block devices" >&2

	sorted_bdfs=($(printf '%s\n' "${!all_devices_d[@]}" | sort))

	for bdf in "${sorted_bdfs[@]}"; do
		driver=${pci_bus_driver["$bdf"]}
		if [ "$numa_nodes" = "0" ]; then
			node="-"
		else
			node=$(cat "/sys/bus/pci/devices/$bdf/numa_node")
			if ((node == -1)); then
				node=unknown
			fi
		fi
		if [ "$driver" = "nvme" ] && [ -d "/sys/bus/pci/devices/$bdf/nvme" ]; then
			name=$(ls "/sys/bus/pci/devices/$bdf/nvme")
		else
			name="-"
		fi

		if [[ -n ${nvme_d["$bdf"]} || -n ${virtio_d["$bdf"]} ]]; then
			blknames=($(get_block_dev_from_bdf "$bdf"))
		else
			blknames=("-")
		fi

		desc=""
		desc=${desc:-${nvme_d["$bdf"]:+NVMe${nvme_vmd_d["$bdf"]:+@${nvme_vmd_d["$bdf"]}(VMD)}}}
		desc=${desc:-${ioat_d["$bdf"]:+I/OAT}}
		desc=${desc:-${dsa_d["$bdf"]:+DSA}}
		desc=${desc:-${iaa_d["$bdf"]:+IAA}}
		desc=${desc:-${virtio_d["$bdf"]:+virtio}}
		desc=${desc:-${vmd_d["$bdf"]:+VMD}}

		printf '%-25s %-15s %-6s %-6s %-7s %-16s %-10s %s\n' \
			"$desc" "$bdf" "${pci_ids_vendor["$bdf"]#0x}" "${pci_ids_device["$bdf"]#0x}" \
			"$node" "${driver:--}" "${name:-}" "${blknames[*]:--}"
	done
}

# Print contigmem state and a table of all SPDK-compatible devices (FreeBSD).
function status_freebsd() {
	local pci

	status_print() (
		local type=$1
		local dev driver

		shift

		for pci; do
			printf '%-8s %-15s %-6s %-6s %-16s\n' \
				"$type" \
				"$pci" \
				"${pci_ids_vendor["$pci"]}" \
				"${pci_ids_device["$pci"]}" \
				"${pci_bus_driver["$pci"]}"
		done | sort -k2,2
	)

	local contigmem=present
	local contigmem_buffer_size
	local contigmem_num_buffers

	if ! kldstat -q -m contigmem; then
		contigmem="not present"
	fi
	if ! contigmem_buffer_size=$(kenv hw.contigmem.buffer_size 2> /dev/null); then
		contigmem_buffer_size="not set"
	fi
	if ! contigmem_num_buffers=$(kenv hw.contigmem.num_buffers 2> /dev/null); then
		contigmem_num_buffers="not set"
	fi

	cat <<- BSD_INFO
		Contigmem ($contigmem)
		Buffer Size: $contigmem_buffer_size
		Num Buffers: $contigmem_num_buffers

	BSD_INFO

	printf '\n%-8s %-15s %-6s %-6s %-16s\n' \
		"Type" "BDF" "Vendor" "Device" "Driver" >&2

	status_print "NVMe" "${!nvme_d[@]}"
	status_print "I/OAT" "${!ioat_d[@]}"
	status_print "DSA" "${!dsa_d[@]}"
	status_print "IAA" "${!iaa_d[@]}"
	status_print "VMD" "${!vmd_d[@]}"
}

# Attach all supported devices to the nic_uio driver (FreeBSD).
function configure_freebsd_pci() {
	local BDFS=()

	BDFS+=("${!nvme_d[@]}")
	BDFS+=("${!ioat_d[@]}")
	BDFS+=("${!dsa_d[@]}")
	BDFS+=("${!iaa_d[@]}")
	BDFS+=("${!vmd_d[@]}")

	# Drop the domain part from all the addresses
	BDFS=("${BDFS[@]#*:}")

	local IFS=","
	kldunload nic_uio.ko || true
	kenv hw.nic_uio.bdfs="${BDFS[*]}"
	kldload nic_uio.ko
}

# FreeBSD "config" mode: bind devices and (re)load contigmem with HUGEMEM.
function configure_freebsd() {
	if ! check_for_driver_freebsd; then
		echo "DPDK drivers (contigmem and/or nic_uio) are missing, aborting" >&2
		return 1
	fi
	configure_freebsd_pci
	# If contigmem is already loaded but the HUGEMEM specified doesn't match the
	# previous value, unload contigmem so that we can reload with the new value.
	if kldstat -q -m contigmem; then
		# contigmem may be loaded, but the kernel environment doesn't have to
		# be necessarily set at this point. If it isn't, kenv will fail to
		# pick up the hw. options. Handle it.
		if ! contigmem_num_buffers=$(kenv hw.contigmem.num_buffers); then
			contigmem_num_buffers=-1
		fi 2> /dev/null
		if ((contigmem_num_buffers != HUGEMEM / 256)); then
			kldunload contigmem.ko
		fi
	fi
	if ! kldstat -q -m contigmem; then
		kenv hw.contigmem.num_buffers=$((HUGEMEM / 256))
		kenv hw.contigmem.buffer_size=$((256 * 1024 * 1024))
		kldload contigmem.ko
	fi
}

function reset_freebsd() {
	kldunload contigmem.ko || true
	kldunload nic_uio.ko || true
}

# Resolve HUGEPGSZ/NRHUGE from the environment, falling back to the kernel's
# default hugepage size and HUGEMEM.
function set_hp() {
	if [[ -n $HUGEPGSZ && ! -e /sys/kernel/mm/hugepages/hugepages-${HUGEPGSZ}kB ]]; then
		echo "${HUGEPGSZ}kB is not supported by the running kernel, ignoring" >&2
		unset -v HUGEPGSZ
	fi

	HUGEPGSZ=${HUGEPGSZ:-$(grep Hugepagesize /proc/meminfo | cut -d : -f 2 | tr -dc '0-9')}
	HUGEPGSZ_MB=$((HUGEPGSZ / 1024))
	NRHUGE=${NRHUGE:-$(((HUGEMEM + HUGEPGSZ_MB - 1) / HUGEPGSZ_MB))}
}

kmsg "spdk: $0 $* (start)"

CMD=reset cache_pci_bus

mode=$1

if [ -z "$mode" ]; then
	mode="config"
fi

: "${HUGEMEM:=2048}"
: "${PCI_ALLOWED:=""}"
: "${PCI_BLOCKED:=""}"

if [ -n "$NVME_ALLOWED" ]; then
	PCI_ALLOWED="$PCI_ALLOWED $NVME_ALLOWED"
fi

if [ -n "$SKIP_PCI" ]; then
	PCI_ALLOWED="none"
fi

if [ -z "$TARGET_USER" ]; then
	TARGET_USER="$SUDO_USER"
	if [ -z "$TARGET_USER" ]; then
		TARGET_USER=$(logname 2> /dev/null) || true
	fi
fi

collect_devices "$mode"

if [[ $os == Linux ]]; then
	set_hp
fi

if [[ $mode == interactive ]]; then
	source "$rootdir/scripts/common/setup/interactive.sh"
	main_menu "$2" || exit 0
fi

if [[ $mode == reset && $PCI_BLOCK_SYNC_ON_RESET == yes ]]; then
	# Note that this will wait only for the first block device attached to
	# a given storage controller. For nvme this may miss some of the devs
	# in case multiple namespaces are being in place.
	# FIXME: Wait for nvme controller(s) to be in live state and determine
	# number of configured namespaces, build list of potential block devs
	# and pass them to sync_dev_uevents. Is it worth the effort?
	bdfs_to_wait_for=()
	for bdf in "${!all_devices_d[@]}"; do
		((all_devices_d["$bdf"] == 0)) || continue
		if [[ -n ${nvme_d["$bdf"]} || -n ${virtio_d["$bdf"]} ]]; then
			[[ $(collect_driver "$bdf") != "${pci_bus_driver["$bdf"]}" ]] || continue
			bdfs_to_wait_for+=("$bdf")
		fi
	done
	if ((${#bdfs_to_wait_for[@]} > 0)); then
		echo "Waiting for block devices as requested"
		export UEVENT_TIMEOUT=5 DEVPATH_LOOKUP=yes DEVPATH_SUBSYSTEM=pci
		"$rootdir/scripts/sync_dev_uevents.sh" \
			block/disk \
			"${bdfs_to_wait_for[@]}" &
		sync_pid=$!
	fi
fi

if [[ $os == Linux ]]; then
	if [ "$mode" == "config" ]; then
		configure_linux
	elif [ "$mode" == "cleanup" ]; then
		cleanup_linux
		clear_hugepages
	elif [ "$mode" == "reset" ]; then
		reset_linux
	elif [ "$mode" == "status" ]; then
		status_linux
	elif [ "$mode" == "help" ]; then
		usage "$0"
	else
		usage "$0" "Invalid argument '$mode'"
	fi
else
	if [ "$mode" == "config" ]; then
		configure_freebsd
	elif [ "$mode" == "reset" ]; then
		reset_freebsd
	elif [ "$mode" == "cleanup" ]; then
		echo "setup.sh cleanup function not yet supported on $os"
	elif [ "$mode" == "status" ]; then
		status_freebsd
	elif [ "$mode" == "help" ]; then
		usage "$0"
	else
		usage "$0" "Invalid argument '$mode'"
	fi
fi

if [[ -e /proc/$sync_pid/status ]]; then
	wait "$sync_pid"
fi

kmsg "spdk: $0 $* (done)"