#!/usr/bin/env bash
# SPDX-License-Identifier: BSD-3-Clause
# Copyright (C) 2016 Intel Corporation
# All rights reserved.
#
set -e
shopt -s nullglob extglob

os=$(uname -s)

if [[ $os != Linux && $os != FreeBSD ]]; then
	echo "Unsupported platform ($os), aborting"
	exit 1
fi

rootdir=$(readlink -f $(dirname $0))/..
source "$rootdir/scripts/common.sh"

function usage() {
	if [[ $os == Linux ]]; then
		options="[config|reset|status|cleanup|interactive|help]"
	else
		options="[config|reset|interactive|help]"
	fi

	[[ -n $2 ]] && (
		echo "$2"
		echo ""
	)
	echo "Helper script for allocating hugepages and binding NVMe, I/OAT, VMD and Virtio devices"
	echo "to a generic VFIO kernel driver. If VFIO is not available on the system, this script"
	echo "will fall back to UIO. NVMe and Virtio devices with active mountpoints will be ignored."
	echo "All hugepage operations use the default hugepage size on the system (hugepagesz)."
	echo "Usage: $(basename $1) $options"
	echo
	echo "$options - as follows:"
	echo "config            Default mode. Allocate hugepages and bind PCI devices."
	if [[ $os == Linux ]]; then
		echo "cleanup           Remove any orphaned files that may be left in the system after SPDK application exit"
	fi
	echo "reset             Rebind PCI devices back to their original drivers."
	echo "                  Also clean up any leftover spdk files/resources."
	echo "                  Hugepage memory size will remain unchanged."
	if [[ $os == Linux ]]; then
		echo "status            Print status of all SPDK-compatible devices on the system."
	fi
	echo "interactive       Executes script in interactive mode."
	echo "help              Print this help message."
	echo
	echo "The following environment variables can be specified."
	echo "HUGEMEM           Size of hugepage memory to allocate (in MB). 2048 by default."
	echo "                  For NUMA systems, the hugepages will be distributed on node0 by"
	echo "                  default."
	echo "HUGE_EVEN_ALLOC   If set to 'yes', hugepages will be evenly distributed across all"
	echo "                  system's NUMA nodes (effectively ignoring anything set in HUGENODE)."
	echo "                  Uses kernel's default for hugepage size."
	echo "NRHUGE            Number of hugepages to allocate. This variable overrides HUGEMEM."
	echo "HUGENODE          Specific NUMA node to allocate hugepages on. Multiple nodes can be"
	echo "                  separated with commas. By default, NRHUGE will be applied on each node."
	echo "                  Hugepages can be defined per node with e.g.:"
	echo "                  HUGENODE='nodes_hp[0]=2048,nodes_hp[1]=512,2' - this will allocate"
	echo "                  2048 pages for node0, 512 for node1 and default NRHUGE for node2."
	echo "HUGEPGSZ          Size of the hugepages to use in kB. If not set, kernel's default"
	echo "                  setting is used."
	echo "SHRINK_HUGE       If set to 'yes', hugepages allocation won't be skipped in case"
	echo "                  number of requested hugepages is lower than what's already"
	echo "                  allocated. Doesn't apply when HUGE_EVEN_ALLOC is in use."
	echo "CLEAR_HUGE        If set to 'yes', the attempt to remove hugepages from all nodes will"
	echo "                  be made prior to allocation."
	echo "PCI_ALLOWED"
	echo "PCI_BLOCKED       Whitespace separated list of PCI devices (NVMe, I/OAT, VMD, Virtio)."
	echo "                  Each device must be specified as a full PCI address."
	echo "                  E.g. PCI_ALLOWED=\"0000:01:00.0 0000:02:00.0\""
	echo "                  To block all PCI devices: PCI_ALLOWED=\"none\""
	echo "                  To allow all PCI devices except 0000:01:00.0: PCI_BLOCKED=\"0000:01:00.0\""
	echo "                  To allow only PCI device 0000:01:00.0: PCI_ALLOWED=\"0000:01:00.0\""
	echo "                  If PCI_ALLOWED and PCI_BLOCKED are empty or unset, all PCI devices"
	echo "                  will be bound."
	echo "                  Each device in PCI_BLOCKED will be ignored (driver won't be changed)."
	echo "                  PCI_BLOCKED has precedence over PCI_ALLOWED."
	echo "TARGET_USER       User that will own hugepage mountpoint directory and vfio groups."
	echo "                  By default the current user will be used."
	echo "DRIVER_OVERRIDE   Disable automatic vfio-pci/uio_pci_generic selection and forcefully"
	echo "                  bind devices to the given driver."
	echo "                  E.g. DRIVER_OVERRIDE=uio_pci_generic or DRIVER_OVERRIDE=/home/public/dpdk/build/kmod/igb_uio.ko"
	echo "PCI_BLOCK_SYNC_ON_RESET"
	echo "                  If set in the environment, the attempt to wait for block devices associated"
	echo "                  with given PCI device will be made upon reset."
	echo "UNBIND_ENTIRE_IOMMU_GROUP"
	echo "                  If set, all devices from nvme's iommu group will be unbound from their drivers."
	echo "                  Use with caution."
	echo "DEV_TYPE"
	echo "                  Perform action only against selected type of devices. Supported:"
	echo "                  IOAT|DSA|IAA|VIRTIO|VMD|NVME."
	echo "                  Default is to select all types."
	echo "FORCE_NIC_UIO_REBIND"
	echo "                  When set to 'yes', an attempt to reload nic_uio will be made regardless"
	echo "                  of the kernel environment. Applicable only under FreeBSD."
	exit 0
}
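
# Illustrative invocations (values are examples, not defaults enforced by this
# script beyond those documented above):
#   sudo ./setup.sh                              # config mode, 2048 MB of hugepages
#   sudo HUGEMEM=8192 ./setup.sh config          # allocate 8 GB of hugepages instead
#   sudo PCI_BLOCKED="0000:01:00.0" ./setup.sh   # bind everything except one device
#   sudo ./setup.sh reset                        # rebind devices to their original drivers
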
PCI_ALLOWED=\"0000:01:00.0 0000:02:00.0\"" 74 echo " To block all PCI devices: PCI_ALLOWED=\"none\"" 75 echo " To allow all PCI devices except 0000:01:00.0: PCI_BLOCKED=\"0000:01:00.0\"" 76 echo " To allow only PCI device 0000:01:00.0: PCI_ALLOWED=\"0000:01:00.0\"" 77 echo " If PCI_ALLOWED and PCI_BLOCKED are empty or unset, all PCI devices" 78 echo " will be bound." 79 echo " Each device in PCI_BLOCKED will be ignored (driver won't be changed)." 80 echo " PCI_BLOCKED has precedence over PCI_ALLOWED." 81 echo "TARGET_USER User that will own hugepage mountpoint directory and vfio groups." 82 echo " By default the current user will be used." 83 echo "DRIVER_OVERRIDE Disable automatic vfio-pci/uio_pci_generic selection and forcefully" 84 echo " bind devices to the given driver." 85 echo " E.g. DRIVER_OVERRIDE=uio_pci_generic or DRIVER_OVERRIDE=/home/public/dpdk/build/kmod/igb_uio.ko" 86 echo "PCI_BLOCK_SYNC_ON_RESET" 87 echo " If set in the environment, the attempt to wait for block devices associated" 88 echo " with given PCI device will be made upon reset" 89 echo "UNBIND_ENTIRE_IOMMU_GROUP" 90 echo " If set, all devices from nvme's iommu group will be unbound from their drivers." 91 echo " Use with caution." 92 echo "DEV_TYPE" 93 echo " Perform action only against selected type of devices. Supported:" 94 echo " IOAT|DSA|IAA|VIRTIO|VMD|NVME." 95 echo " Default is to select all types." 96 echo "FORCE_NIC_UIO_REBIND" 97 echo " When set to 'yes', an attempt to reload nic_uio will be made regardless" 98 echo " of the kernel environment. Applicable only under FreeBSD." 99 exit 0 100} 101 102# In monolithic kernels the lsmod won't work. So 103# back that with a /sys/modules. We also check 104# /sys/bus/pci/drivers/ as neither lsmod nor /sys/modules might 105# contain needed info (like in Fedora-like OS). 106function check_for_driver() { 107 if [[ -z $1 ]]; then 108 return 0 109 fi 110 111 if lsmod | grep -q ${1//-/_}; then 112 return 1 113 fi 114 115 if [[ -d /sys/module/${1} || 116 -d /sys/module/${1//-/_} || 117 -d /sys/bus/pci/drivers/${1} || 118 -d /sys/bus/pci/drivers/${1//-/_} ]]; then 119 return 2 120 fi 121 return 0 122} 123 124function check_for_driver_freebsd() { 125 # Check if dpdk drivers (nic_uio, contigmem) are in the kernel's module path. 126 local search_paths path driver 127 IFS=";" read -ra search_paths < <(kldconfig -rU) 128 129 for driver in contigmem.ko nic_uio.ko; do 130 for path in "${search_paths[@]}"; do 131 [[ -f $path/$driver ]] && continue 2 132 done 133 return 1 134 done 135 return 0 136} 137 138function pci_dev_echo() { 139 local bdf="$1" 140 shift 141 echo "$bdf (${pci_ids_vendor["$bdf"]#0x} ${pci_ids_device["$bdf"]#0x}): $*" 142} 143 144function probe_driver() { 145 local bdf=$1 146 local driver_name=$2 147 old_driver_name=${pci_bus_driver["$bdf"]:-no driver} 148 149 if [[ $driver_name == "$old_driver_name" ]]; then 150 pci_dev_echo "$bdf" "Already using the $old_driver_name driver" 151 return 0 152 fi 153 154 if [[ $old_driver_name != "no driver" ]]; then 155 echo "$bdf" > "/sys/bus/pci/devices/$bdf/driver/unbind" 156 fi 157 158 pci_dev_echo "$bdf" "$old_driver_name -> $driver_name" 159 160 if [[ $driver_name == "none" ]]; then 161 return 0 162 fi 163 164 local probe_attempts=0 165 echo "$driver_name" > "/sys/bus/pci/devices/$bdf/driver_override" 166 while ! 
echo "$bdf" > "/sys/bus/pci/drivers_probe" && ((probe_attempts++ < 10)); do 167 pci_dev_echo "$bdf" "failed to bind to $driver_name, retrying ($probe_attempts)" 168 sleep 0.5 169 done 2> /dev/null 170 171 echo "" > "/sys/bus/pci/devices/$bdf/driver_override" 172 173 if [[ ! -e /sys/bus/pci/drivers/$driver_name/$bdf ]]; then 174 pci_dev_echo "$bdf" "failed to bind to $driver_name, aborting" 175 return 1 176 fi 177} 178 179function linux_bind_driver() { 180 local bdf="$1" 181 local driver_name="$2" 182 183 probe_driver "$bdf" "$driver_name" 184 185 local iommu_group=${pci_iommu_groups["$bdf"]} 186 if [ -e "/dev/vfio/$iommu_group" ]; then 187 if [ -n "$TARGET_USER" ]; then 188 chown "$TARGET_USER" "/dev/vfio/$iommu_group" 189 fi 190 fi 191 192 local iommug=("${!iommu_groups[iommu_group]}") 193 local _bdf _driver 194 if ((${#iommug[@]} > 1)) && [[ $driver_name == vfio* ]]; then 195 pci_dev_echo "$bdf" "WARNING: detected multiple devices (${#iommug[@]}) under the same IOMMU group!" 196 for _bdf in "${iommug[@]}"; do 197 [[ $_bdf == "$bdf" ]] && continue 198 _driver=$(readlink -f "/sys/bus/pci/devices/$_bdf/driver") && _driver=${_driver##*/} 199 if [[ $_driver == "$driver_name" ]]; then 200 continue 201 fi 202 # See what DPDK considers to be a "viable" iommu group: dpdk/lib/eal/linux/eal_vfio.c -> rte_vfio_setup_device() 203 pci_dev_echo "$bdf" "WARNING: ${_bdf##*/} not bound to $driver_name (${_driver:-no driver})" 204 pci_dev_echo "$bdf" "WARNING All devices in the IOMMU group must be bound to the same driver or unbound" 205 if [[ $UNBIND_ENTIRE_IOMMU_GROUP == yes ]]; then 206 pci_dev_echo "$bdf" "WARNING: Attempting to unbind ${_bdf##*/}" 207 pci_bus_driver["${_bdf##*/}"]=$_driver 208 probe_driver "${_bdf##*/}" none 209 fi 210 done 211 fi 212 213} 214 215function linux_unbind_driver() { 216 local bdf="$1" 217 local old_driver_name=${pci_bus_driver["$bdf"]:-no driver} 218 219 if [[ $old_driver_name == "no driver" ]]; then 220 pci_dev_echo "$bdf" "Not bound to any driver" 221 return 0 222 fi 223 224 if [[ -e /sys/bus/pci/drivers/$old_driver_name ]]; then 225 echo "$bdf" > "/sys/bus/pci/drivers/$old_driver_name/unbind" 226 echo "" > "/sys/bus/pci/devices/$bdf/driver_override" 227 fi 228 229 pci_dev_echo "$bdf" "$old_driver_name -> no driver" 230} 231 232function linux_hugetlbfs_mounts() { 233 mount | grep ' type hugetlbfs ' | awk '{ print $3 }' 234} 235 236function get_block_dev_from_bdf() { 237 local bdf=$1 238 local block blocks=() ctrl sub 239 240 for block in /sys/block/!(nvme*); do 241 if [[ $(readlink -f "$block/device") == *"/$bdf/"* ]]; then 242 blocks+=("${block##*/}") 243 fi 244 done 245 246 blocks+=($(get_block_dev_from_nvme "$bdf")) 247 248 printf '%s\n' "${blocks[@]}" 249} 250 251function get_block_dev_from_nvme() { 252 local bdf=$1 block ctrl sub 253 254 for ctrl in /sys/class/nvme/nvme*; do 255 [[ -e $ctrl/address && $(< "$ctrl/address") == "$bdf" ]] || continue 256 sub=$(< "$ctrl/subsysnqn") && break 257 done 258 259 [[ -n $sub ]] || return 0 260 261 for block in /sys/block/nvme*; do 262 [[ -e $block/hidden && $(< "$block/hidden") == 1 ]] && continue 263 [[ $(< "$block/device/subsysnqn") == "$sub" ]] && echo "${block##*/}" 264 done 265} 266 267function get_used_bdf_block_devs() { 268 local bdf=$1 269 local blocks block blockp dev mount holder 270 local used 271 272 hash lsblk &> /dev/null || return 1 273 blocks=($(get_block_dev_from_bdf "$bdf")) 274 275 for block in "${blocks[@]}"; do 276 # Check if the device is hold by some other, regardless if it's mounted 277 # or not. 
function get_used_bdf_block_devs() {
	local bdf=$1
	local blocks block blockp dev mount holder
	local used

	hash lsblk &> /dev/null || return 1
	blocks=($(get_block_dev_from_bdf "$bdf"))

	for block in "${blocks[@]}"; do
		# Check if the device is held by some other device, regardless of
		# whether it's mounted or not.
		for holder in "/sys/class/block/$block"*/holders/*; do
			[[ -e $holder ]] || continue
			blockp=${holder%/holders*} blockp=${blockp##*/}
			if [[ -e $holder/slaves/$blockp ]]; then
				used+=("holder@$blockp:${holder##*/}")
			fi
		done
		while read -r dev mount; do
			if [[ -e $mount ]]; then
				used+=("mount@$block:$dev")
			fi
		done < <(lsblk -l -n -o NAME,MOUNTPOINT "/dev/$block")
		if ((${#used[@]} == 0)); then
			# Make sure we check if there's any valid data present on the target device
			# regardless of whether it's being actively used or not. This is mainly done
			# to make sure we don't miss more complex setups like ZFS pools, etc.
			if block_in_use "$block" > /dev/null; then
				used+=("data@$block")
			fi
		fi
	done

	if ((${#used[@]} > 0)); then
		printf '%s\n' "${used[@]}"
	fi
}

function collect_devices() {
	local mode=$1 in_use

	map_supported_devices "$DEV_TYPE"

	for bdf in "${!all_devices_d[@]}"; do
		in_use=0
		if [[ $mode != status ]]; then
			if ! pci_can_use "$bdf"; then
				pci_dev_echo "$bdf" "Skipping denied controller at $bdf"
				in_use=1
			fi
		fi
		if [[ -n ${nvme_d["$bdf"]} || -n ${virtio_d["$bdf"]} ]]; then
			if ! verify_bdf_block_devs "$bdf"; then
				in_use=1
			fi
		fi
		if [[ -n ${vmd_d["$bdf"]} ]]; then
			if [[ $PCI_ALLOWED != *"$bdf"* ]]; then
				pci_dev_echo "$bdf" "Skipping not allowed VMD controller at $bdf"
				in_use=1
			elif ((vmd_nvme_count["$bdf"] > 0)) && [[ $DRIVER_OVERRIDE != none && $mode == config ]]; then
				cat <<- MESSAGE
					Binding new driver to VMD device with NVMe SSDs attached to the kernel:
					${!vmd_nvme_d["$bdf"]}
					The binding process may go faster if you first run this script with
					DRIVER_OVERRIDE="none" to unbind only the NVMe SSDs, and then run
					again to unbind the VMD devices.
				MESSAGE
			fi
		fi
		if [[ -n ${dsa_d["$bdf"]} ]] && [[ $PCI_ALLOWED != *"$bdf"* ]]; then
			pci_dev_echo "$bdf" "Skipping not allowed DSA controller at $bdf"
			in_use=1
		fi
		if [[ -n ${iaa_d["$bdf"]} ]] && [[ $PCI_ALLOWED != *"$bdf"* ]]; then
			pci_dev_echo "$bdf" "Skipping not allowed IAA controller at $bdf"
			in_use=1
		fi
		# Update in-use for each bdf. Default from map_supported_devices() is 0 == "not used"
		local -n type_ref=${all_devices_type_d["$bdf"]}_d
		type_ref["$bdf"]=$in_use
		all_devices_d["$bdf"]=$in_use
	done

	# Check if we got any nvmes attached to VMDs sharing the same iommu_group - if there
	# are any, skip them since they won't be usable by SPDK without moving the entire VMD
	# ctrl away from the kernel first. That said, allow touching the nvmes in case the user
	# requested all devices to be unbound from any driver or if a dedicated override flag
	# was set.
	[[ -z $ALLOW_NVME_BEHIND_VMD && $DRIVER_OVERRIDE != none ]] || return 0

	for bdf in "${!nvme_d[@]}"; do
		is_nvme_iommu_shared_with_vmd "$bdf" || continue
		nvme_d["$bdf"]=1 all_devices_d["$bdf"]=1
		pci_dev_echo "$bdf" "Skipping nvme behind VMD (${nvme_vmd_d["$bdf"]})"
	done

	get_unsupported_nic_uio_hw

	return 0
}

function collect_driver() {
	local bdf=$1
	local drivers driver

	if [[ -e /sys/bus/pci/devices/$bdf/modalias ]] \
		&& drivers=($(modprobe -R "$(< "/sys/bus/pci/devices/$bdf/modalias")")); then
		# Pick the first entry in case multiple aliases are bound to a driver.
		driver=$(readlink -f "/sys/module/${drivers[0]}/drivers/pci:"*)
		driver=${driver##*/}
	else
		[[ -n ${nvme_d["$bdf"]} ]] && driver=nvme
		[[ -n ${ioat_d["$bdf"]} ]] && driver=ioatdma
		[[ -n ${dsa_d["$bdf"]} ]] && driver=idxd
		[[ -n ${iaa_d["$bdf"]} ]] && driver=idxd
		[[ -n ${virtio_d["$bdf"]} ]] && driver=virtio-pci
		[[ -n ${vmd_d["$bdf"]} ]] && driver=vmd
	fi 2> /dev/null
	echo "$driver"
}
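
# The modalias lookup above can be reproduced by hand (BDF is an example):
#   modprobe -R "$(< /sys/bus/pci/devices/0000:01:00.0/modalias)"   # -> e.g. "nvme"
# The /sys/module/<module>/drivers/pci:* symlink then yields the driver name.
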
function verify_bdf_block_devs() {
	local bdf=$1
	local blknames
	blknames=($(get_used_bdf_block_devs "$bdf")) || return 1

	if ((${#blknames[@]} > 0)); then
		local IFS=","
		pci_dev_echo "$bdf" "Active devices: ${blknames[*]}, so not binding PCI dev"
		return 1
	fi
}

function configure_linux_pci() {
	local driver_path=""
	driver_name=""
	igb_uio_fallback=""

	if [[ -r "$rootdir/dpdk/build-tmp/kernel/linux/igb_uio/igb_uio.ko" ]]; then
		# igb_uio is a common driver to override with, and it depends on uio.
		modprobe uio || true
		if ! check_for_driver igb_uio || insmod "$rootdir/dpdk/build-tmp/kernel/linux/igb_uio/igb_uio.ko"; then
			igb_uio_fallback="$rootdir/dpdk/build-tmp/kernel/linux/igb_uio/igb_uio.ko"
		fi
	fi

	if [[ "${DRIVER_OVERRIDE}" == "none" ]]; then
		driver_name=none
	elif [[ -n "${DRIVER_OVERRIDE}" ]]; then
		driver_path="$DRIVER_OVERRIDE"
		driver_name="${DRIVER_OVERRIDE##*/}"
		# modprobe and the sysfs don't use the .ko suffix.
		driver_name=${driver_name%.ko}
		# path = name -> there is no path
		if [[ "$driver_path" = "$driver_name" ]]; then
			driver_path=""
		fi
	elif is_iommu_enabled; then
		driver_name=vfio-pci
		# Just in case, attempt to load the VFIO_IOMMU_TYPE1 module into the kernel - this
		# should be done automatically by modprobe since this particular module should
		# be a part of vfio-pci dependencies, however, on some distros, it seems that
		# it's not the case. See #1689.
		if modinfo vfio_iommu_type1 > /dev/null; then
			modprobe vfio_iommu_type1
		fi
	elif ! check_for_driver uio_pci_generic || modinfo uio_pci_generic > /dev/null 2>&1; then
		driver_name=uio_pci_generic
	elif [[ -e $igb_uio_fallback ]]; then
		driver_path="$igb_uio_fallback"
		driver_name="igb_uio"
		echo "WARNING: uio_pci_generic not detected - using $driver_name"
	else
		echo "No valid drivers found [vfio-pci, uio_pci_generic, igb_uio]. Please enable one of the kernel modules."
		return 1
	fi

	# modprobe assumes the module lives in the kernel's module directory. If the
	# user passed in a path, use insmod instead.
	if [[ $driver_name != "none" ]]; then
		if [[ -n "$driver_path" ]]; then
			insmod $driver_path || true
		else
			modprobe $driver_name
		fi
	fi

	for bdf in "${!all_devices_d[@]}"; do
		if ((all_devices_d["$bdf"] == 0)); then
			if [[ -n ${nvme_d["$bdf"]} ]]; then
				# Some nvme controllers may take a significant amount of time while being
				# unbound from the driver. Put that task into the background to speed up the
				# whole process. Currently this is done only for the devices bound to the
				# nvme driver as others, i.e. ioatdma's, trigger a kernel BUG when being
				# unbound in parallel. See https://bugzilla.kernel.org/show_bug.cgi?id=209041.
				linux_bind_driver "$bdf" "$driver_name" &
			else
				linux_bind_driver "$bdf" "$driver_name"
			fi
		fi
	done
	wait

	echo "1" > "/sys/bus/pci/rescan"
}
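
# Driver selection can be forced via DRIVER_OVERRIDE, e.g. (paths illustrative):
#   sudo DRIVER_OVERRIDE=uio_pci_generic ./setup.sh       # plain module name -> modprobe
#   sudo DRIVER_OVERRIDE=/path/to/igb_uio.ko ./setup.sh   # full path -> insmod
#   sudo DRIVER_OVERRIDE=none ./setup.sh                  # unbind only
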
function cleanup_linux() {
	local dirs_to_clean=() files_to_clean=() opened_files=() file_locks=()
	local match_spdk="spdk_tgt|iscsi|vhost|nvmf|rocksdb|bdevio|bdevperf|vhost_fuzz|nvme_fuzz|accel_perf|bdev_svc"

	dirs_to_clean=({/var/run,/tmp}/dpdk/spdk{,_pid}+([0-9]))
	if [[ -d $XDG_RUNTIME_DIR ]]; then
		dirs_to_clean+=("$XDG_RUNTIME_DIR/dpdk/spdk"{,_pid}+([0-9]))
	fi

	for dir in "${dirs_to_clean[@]}"; do
		files_to_clean+=("$dir/"*)
	done
	file_locks+=(/var/tmp/spdk_pci_lock*)
	file_locks+=(/var/tmp/spdk_cpu_lock*)

	files_to_clean+=(/dev/shm/@(@($match_spdk)_trace|spdk_iscsi_conns)*)
	files_to_clean+=("${file_locks[@]}")

	# This may fail in case the path that readlink attempts to resolve suddenly
	# disappears (as may happen with terminating processes).
	opened_files+=($(readlink -f /proc/+([0-9])/fd/+([0-9]))) || true

	if ((${#opened_files[@]} == 0)); then
		echo "Can't get list of opened files!"
		exit 1
	fi

	echo 'Cleaning'
	for f in "${files_to_clean[@]}"; do
		[[ -e $f ]] || continue
		if [[ ${opened_files[*]} != *"$f"* ]]; then
			echo "Removing: $f"
			rm "$f"
		else
			echo "Still open: $f"
		fi
	done

	for dir in "${dirs_to_clean[@]}"; do
		[[ -d $dir ]] || continue
		if [[ ${opened_files[*]} != *"$dir"* ]]; then
			echo "Removing: $dir"
			rmdir "$dir"
		else
			echo "Still open: $dir"
		fi
	done
	echo "Clean"
}

check_hugepages_alloc() {
	local hp_int=$1
	local allocated_hugepages

	allocated_hugepages=$(< "$hp_int")

	if ((NRHUGE <= allocated_hugepages)) && [[ $SHRINK_HUGE != yes ]]; then
		echo "INFO: Requested $NRHUGE hugepages but $allocated_hugepages already allocated ${2:+on node$2}"
		return 0
	fi

	echo $((NRHUGE < 0 ? 0 : NRHUGE)) > "$hp_int"

	allocated_hugepages=$(< "$hp_int")
	if ((allocated_hugepages < NRHUGE)); then
		cat <<- ERROR

			## ERROR: requested $NRHUGE hugepages but only $allocated_hugepages could be allocated ${2:+on node$2}.
			## Memory might be heavily fragmented. Please try flushing the system cache, or reboot the machine.
		ERROR
		return 1
	fi
}

clear_hugepages() { echo 0 > /proc/sys/vm/nr_hugepages; }
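
# check_hugepages_alloc() writes either to the system-wide interface or to a
# per-node one, e.g. (2048 kB pages on node0 are an example):
#   /proc/sys/vm/nr_hugepages
#   /sys/devices/system/node/node0/hugepages/hugepages-2048kB/nr_hugepages
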
configure_linux_hugepages() {
	local node system_nodes
	local nodes_to_use nodes_hp

	if [[ $CLEAR_HUGE == yes ]]; then
		clear_hugepages
	fi

	if [[ $HUGE_EVEN_ALLOC == yes ]]; then
		clear_hugepages
		check_hugepages_alloc /proc/sys/vm/nr_hugepages
		return 0
	fi

	for node in /sys/devices/system/node/node*; do
		[[ -e $node ]] || continue
		nodes[${node##*node}]=$node/hugepages/hugepages-${HUGEPGSZ}kB/nr_hugepages
	done

	if ((${#nodes[@]} == 0)); then
		# No NUMA support? Fall back to the common interface.
		check_hugepages_alloc /proc/sys/vm/nr_hugepages
		return 0
	fi

	IFS="," read -ra nodes_to_use <<< "$HUGENODE"
	if ((${#nodes_to_use[@]} == 0)); then
		nodes_to_use[0]=0
	fi

	# Align indexes with node ids
	for node in "${!nodes_to_use[@]}"; do
		if [[ ${nodes_to_use[node]} =~ ^nodes_hp\[[0-9]+\]= ]]; then
			eval "${nodes_to_use[node]}"
		elif [[ ${nodes_to_use[node]} =~ ^[0-9]+$ ]]; then
			nodes_hp[nodes_to_use[node]]=$NRHUGE
		fi
	done

	for node in "${!nodes_hp[@]}"; do
		if [[ -z ${nodes[node]} ]]; then
			echo "Node $node doesn't exist, ignoring" >&2
			continue
		fi
		NRHUGE=${nodes_hp[node]:-$NRHUGE} check_hugepages_alloc "${nodes[node]}" "$node"
	done
}

function configure_linux() {
	configure_linux_pci
	hugetlbfs_mounts=$(linux_hugetlbfs_mounts)

	if [ -z "$hugetlbfs_mounts" ]; then
		hugetlbfs_mounts=/mnt/huge
		echo "Mounting hugetlbfs at $hugetlbfs_mounts"
		mkdir -p "$hugetlbfs_mounts"
		mount -t hugetlbfs nodev "$hugetlbfs_mounts"
	fi

	configure_linux_hugepages

	if [ "$driver_name" = "vfio-pci" ]; then
		if [ -n "$TARGET_USER" ]; then
			for mount in $hugetlbfs_mounts; do
				chown "$TARGET_USER" "$mount"
				chmod g+w "$mount"
			done

			MEMLOCK_AMNT=$(su "$TARGET_USER" -c "ulimit -l")
			if [[ $MEMLOCK_AMNT != "unlimited" ]]; then
				MEMLOCK_MB=$((MEMLOCK_AMNT / 1024))
				cat <<- MEMLOCK
					"$TARGET_USER" user memlock limit: $MEMLOCK_MB MB

					This is the maximum amount of memory you will be
					able to use with DPDK and VFIO if run as user "$TARGET_USER".
					To change this, please adjust limits.conf memlock limit for user "$TARGET_USER".
				MEMLOCK
				if ((MEMLOCK_AMNT < 65536)); then
					echo ""
					echo "## WARNING: memlock limit is less than 64MB"
					echo -n "## DPDK with VFIO may not be able to initialize "
					echo "if run as user \"$TARGET_USER\"."
				fi
			fi
		fi
	fi

	if [ "$(uname -i)" == "x86_64" ] && [ ! -e /dev/cpu/0/msr ]; then
		# Some distros build msr as a module. Make sure it's loaded to ensure
		# DPDK can easily figure out the TSC rate rather than relying on 100ms
		# sleeps.
		modprobe msr &> /dev/null || true
	fi
}

function reset_linux_pci() {
	# virtio
	# TODO: check if virtio-pci is loaded first and just unbind if it is not loaded.
	# Requires some more investigation - for example, some kernels do not seem to have
	# virtio-pci but just virtio_scsi instead. Also need to make sure we get the
	# underscore vs. dash right in the virtio_scsi name.
	modprobe virtio-pci || true
	for bdf in "${!all_devices_d[@]}"; do
		((all_devices_d["$bdf"] == 0)) || continue

		driver=$(collect_driver "$bdf")
		if [[ -n $driver ]] && ! check_for_driver "$driver"; then
			linux_bind_driver "$bdf" "$driver"
		else
			linux_unbind_driver "$bdf"
		fi
	done

	echo "1" > "/sys/bus/pci/rescan"
}
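
# 'reset' rebinds devices to whatever collect_driver() resolves, e.g.
# (illustrative):
#   sudo ./setup.sh reset                              # NVMe back to nvme, virtio back to virtio-pci, etc.
#   sudo PCI_BLOCK_SYNC_ON_RESET=yes ./setup.sh reset  # additionally wait for block devices to show up
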
check_for_driver "$driver"; then 656 linux_bind_driver "$bdf" "$driver" 657 else 658 linux_unbind_driver "$bdf" 659 fi 660 done 661 662 echo "1" > "/sys/bus/pci/rescan" 663} 664 665function reset_linux() { 666 reset_linux_pci 667 for mount in $(linux_hugetlbfs_mounts); do 668 for hp in "$mount"/spdk*map_*; do 669 flock -n "$hp" true && rm -f "$hp" 670 done 671 done 672 rm -f /run/.spdk* 673} 674 675function status_linux() { 676 echo "Hugepages" >&2 677 printf "%-6s %10s %8s / %6s\n" "node" "hugesize" "free" "total" >&2 678 679 numa_nodes=0 680 for path in /sys/devices/system/node/node*/hugepages/hugepages-*/; do 681 numa_nodes=$((numa_nodes + 1)) 682 free_pages=$(cat $path/free_hugepages) 683 all_pages=$(cat $path/nr_hugepages) 684 685 [[ $path =~ (node[0-9]+)/hugepages/hugepages-([0-9]+kB) ]] 686 687 node=${BASH_REMATCH[1]} 688 huge_size=${BASH_REMATCH[2]} 689 690 printf "%-6s %10s %8s / %6s\n" $node $huge_size $free_pages $all_pages 691 done 692 693 # fall back to system-wide hugepages 694 if [ "$numa_nodes" = "0" ]; then 695 free_pages=$(grep HugePages_Free /proc/meminfo | awk '{ print $2 }') 696 all_pages=$(grep HugePages_Total /proc/meminfo | awk '{ print $2 }') 697 node="-" 698 huge_size="$HUGEPGSZ" 699 700 printf "%-6s %10s %8s / %6s\n" $node $huge_size $free_pages $all_pages 701 fi 702 703 printf '\n%-25s %-15s %-6s %-6s %-7s %-16s %-10s %s\n' \ 704 "Type" "BDF" "Vendor" "Device" "NUMA" "Driver" "Device" "Block devices" >&2 705 706 sorted_bdfs=($(printf '%s\n' "${!all_devices_d[@]}" | sort)) 707 708 for bdf in "${sorted_bdfs[@]}"; do 709 driver=${pci_bus_driver["$bdf"]} 710 if [ "$numa_nodes" = "0" ]; then 711 node="-" 712 else 713 node=$(cat /sys/bus/pci/devices/$bdf/numa_node) 714 if ((node == -1)); then 715 node=unknown 716 fi 717 fi 718 if [ "$driver" = "nvme" ] && [ -d /sys/bus/pci/devices/$bdf/nvme ]; then 719 name=$(ls /sys/bus/pci/devices/$bdf/nvme) 720 else 721 name="-" 722 fi 723 724 if [[ -n ${nvme_d["$bdf"]} || -n ${virtio_d["$bdf"]} ]]; then 725 blknames=($(get_block_dev_from_bdf "$bdf")) 726 else 727 blknames=("-") 728 fi 729 730 desc="" 731 desc=${desc:-${nvme_d["$bdf"]:+NVMe${nvme_vmd_d["$bdf"]:+@${nvme_vmd_d["$bdf"]}(VMD)}}} 732 desc=${desc:-${ioat_d["$bdf"]:+I/OAT}} 733 desc=${desc:-${dsa_d["$bdf"]:+DSA}} 734 desc=${desc:-${iaa_d["$bdf"]:+IAA}} 735 desc=${desc:-${virtio_d["$bdf"]:+virtio}} 736 desc=${desc:-${vmd_d["$bdf"]:+VMD}} 737 738 printf '%-25s %-15s %-6s %-6s %-7s %-16s %-10s %s\n' \ 739 "$desc" "$bdf" "${pci_ids_vendor["$bdf"]#0x}" "${pci_ids_device["$bdf"]#0x}" \ 740 "$node" "${driver:--}" "${name:-}" "${blknames[*]:--}" 741 done 742} 743 744function status_freebsd() { 745 local pci 746 747 status_print() ( 748 local type=$1 749 local dev driver 750 751 shift 752 753 for pci; do 754 printf '%-8s %-15s %-6s %-6s %-16s\n' \ 755 "$type" \ 756 "$pci" \ 757 "${pci_ids_vendor["$pci"]}" \ 758 "${pci_ids_device["$pci"]}" \ 759 "${pci_bus_driver["$pci"]}" 760 done | sort -k2,2 761 ) 762 763 local contigmem=present 764 local contigmem_buffer_size 765 local contigmem_num_buffers 766 767 if ! kldstat -q -m contigmem; then 768 contigmem="not present" 769 fi 770 if ! contigmem_buffer_size=$(kenv hw.contigmem.buffer_size 2> /dev/null); then 771 contigmem_buffer_size="not set" 772 fi 773 if ! 
function status_freebsd() {
	local pci

	status_print() (
		local type=$1
		local dev driver

		shift

		for pci; do
			printf '%-8s %-15s %-6s %-6s %-16s\n' \
				"$type" \
				"$pci" \
				"${pci_ids_vendor["$pci"]}" \
				"${pci_ids_device["$pci"]}" \
				"${pci_bus_driver["$pci"]}"
		done | sort -k2,2
	)

	local contigmem=present
	local contigmem_buffer_size
	local contigmem_num_buffers

	if ! kldstat -q -m contigmem; then
		contigmem="not present"
	fi
	if ! contigmem_buffer_size=$(kenv hw.contigmem.buffer_size 2> /dev/null); then
		contigmem_buffer_size="not set"
	fi
	if ! contigmem_num_buffers=$(kenv hw.contigmem.num_buffers 2> /dev/null); then
		contigmem_num_buffers="not set"
	fi

	cat <<- BSD_INFO
		Contigmem ($contigmem)
		Buffer Size: $contigmem_buffer_size
		Num Buffers: $contigmem_num_buffers

	BSD_INFO

	printf '\n%-8s %-15s %-6s %-6s %-16s\n' \
		"Type" "BDF" "Vendor" "Device" "Driver" >&2

	status_print "NVMe" "${!nvme_d[@]}"
	status_print "I/OAT" "${!ioat_d[@]}"
	status_print "DSA" "${!dsa_d[@]}"
	status_print "IAA" "${!iaa_d[@]}"
	status_print "VMD" "${!vmd_d[@]}"
}

function configure_freebsd_pci() {
	local BDFS

	BDFS+=("$@")

	if ((${#unsupported_nic_uio_hw[@]} > 0)) && [[ $FORCE_NIC_UIO_REBIND != yes ]]; then
		warn_unsupported_nic_uio_hw
		return 1
	fi

	BDFS+=("${unsupported_nic_uio_hw[@]}")

	if kldstat -n nic_uio &> /dev/null; then
		kldunload nic_uio.ko
	fi

	local IFS=","
	kenv hw.nic_uio.bdfs="${BDFS[*]}"
	kldload nic_uio.ko
}

function get_unsupported_nic_uio_hw() {
	local bdfs bdf all_devices
	local -g unsupported_nic_uio_hw

	IFS="," read -ra bdfs < <(kenv hw.nic_uio.bdfs 2> /dev/null) || return 0

	for bdf in "${bdfs[@]}"; do
		grep -q "$bdf" <(printf '%s\n' "${!all_devices_d[@]}") || unsupported_nic_uio_hw+=("$bdf")
	done

	return 0
}

function warn_unsupported_nic_uio_hw() {
	cat <<- NIC_UIO

		WARNING: Unsupported devices detected in the nic_uio setup:

		$(printf '  %s\n' "${unsupported_nic_uio_hw[@]}")

		Remove them first or pass FORCE_NIC_UIO_REBIND=yes through the environment.

	NIC_UIO
}

function configure_freebsd() {
	_configure_freebsd "${!nvme_d[@]}" "${!ioat_d[@]}" "${!dsa_d[@]}" "${!iaa_d[@]}" "${!vmd_d[@]}"
}

function _configure_freebsd() {
	if ! check_for_driver_freebsd; then
		echo "DPDK drivers (contigmem and/or nic_uio) are missing, aborting" >&2
		return 1
	fi
	configure_freebsd_pci "$@"
	# If contigmem is already loaded, but the HUGEMEM specified doesn't match the
	# previous value, unload contigmem so that we can reload with the new value.
	if kldstat -q -m contigmem; then
		# contigmem may be loaded, but the kernel environment doesn't have to
		# be necessarily set at this point. If it isn't, kenv will fail to
		# pick up the hw. options. Handle it.
		if ! contigmem_num_buffers=$(kenv hw.contigmem.num_buffers); then
			contigmem_num_buffers=-1
		fi 2> /dev/null
		if ((contigmem_num_buffers != HUGEMEM / 256)); then
			kldunload contigmem.ko
		fi
	fi
	if ! kldstat -q -m contigmem; then
		kenv hw.contigmem.num_buffers=$((HUGEMEM / 256))
		kenv hw.contigmem.buffer_size=$((256 * 1024 * 1024))
		kldload contigmem.ko
	fi
}

function reset_freebsd() {
	# Don't reap the entire nic_uio setup in case there are unsupported devices in the kernel env
	if ((${#unsupported_nic_uio_hw[@]} > 0)) && [[ $FORCE_NIC_UIO_REBIND != yes ]]; then
		warn_unsupported_nic_uio_hw
		return 1
	fi

	kldunload contigmem.ko || true
	kldunload nic_uio.ko || true

	if ((${#unsupported_nic_uio_hw[@]} > 0)); then
		# HACK: try to be nice and recreate the setup, but only with the unsupported devices
		_unsupported_nic_uio_hw=("${unsupported_nic_uio_hw[@]}") unsupported_nic_uio_hw=()
		_configure_freebsd "${_unsupported_nic_uio_hw[@]}"
	fi
}
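
# contigmem sizing above: HUGEMEM (in MB) is split into 256 MB buffers, so with
# the default HUGEMEM=2048 the equivalent manual setup would be:
#   kenv hw.contigmem.num_buffers=8          # 2048 / 256
#   kenv hw.contigmem.buffer_size=268435456  # 256 * 1024 * 1024
#   kldload contigmem.ko
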
function set_hp() {
	if [[ -n $HUGEPGSZ && ! -e /sys/kernel/mm/hugepages/hugepages-${HUGEPGSZ}kB ]]; then
		echo "${HUGEPGSZ}kB is not supported by the running kernel, ignoring" >&2
		unset -v HUGEPGSZ
	fi

	HUGEPGSZ=${HUGEPGSZ:-$(grep Hugepagesize /proc/meminfo | cut -d : -f 2 | tr -dc '0-9')}
	HUGEPGSZ_MB=$((HUGEPGSZ / 1024))
	NRHUGE=${NRHUGE:-$(((HUGEMEM + HUGEPGSZ_MB - 1) / HUGEPGSZ_MB))}
}

kmsg "spdk: $0 $* (start)"

CMD=reset cache_pci_bus

mode=$1

if [ -z "$mode" ]; then
	mode="config"
fi

: ${HUGEMEM:=2048}
: ${PCI_ALLOWED:=""}
: ${PCI_BLOCKED:=""}

if [ -n "$NVME_ALLOWED" ]; then
	PCI_ALLOWED="$PCI_ALLOWED $NVME_ALLOWED"
fi

if [ -n "$SKIP_PCI" ]; then
	PCI_ALLOWED="none"
fi

if [ -z "$TARGET_USER" ]; then
	TARGET_USER="$SUDO_USER"
	if [ -z "$TARGET_USER" ]; then
		TARGET_USER=$(logname 2> /dev/null) || true
	fi
fi

collect_devices "$mode"

if [[ $os == Linux ]]; then
	set_hp
fi

if [[ $mode == interactive ]]; then
	source "$rootdir/scripts/common/setup/interactive.sh"
	main_menu "$2" || exit 0
fi

if [[ $mode == reset && $PCI_BLOCK_SYNC_ON_RESET == yes ]]; then
	# Note that this will wait only for the first block device attached to
	# a given storage controller. For nvme this may miss some of the devs
	# in case multiple namespaces are in place.
	# FIXME: Wait for nvme controller(s) to be in live state and determine
	# the number of configured namespaces, build a list of potential block
	# devs and pass them to sync_dev_uevents. Is it worth the effort?
	bdfs_to_wait_for=()
	for bdf in "${!all_devices_d[@]}"; do
		((all_devices_d["$bdf"] == 0)) || continue
		if [[ -n ${nvme_d["$bdf"]} || -n ${virtio_d["$bdf"]} ]]; then
			[[ $(collect_driver "$bdf") != "${pci_bus_driver["$bdf"]}" ]] || continue
			bdfs_to_wait_for+=("$bdf")
		fi
	done
	if ((${#bdfs_to_wait_for[@]} > 0)); then
		echo "Waiting for block devices as requested"
		export UEVENT_TIMEOUT=5 DEVPATH_LOOKUP=yes DEVPATH_SUBSYSTEM=pci
		"$rootdir/scripts/sync_dev_uevents.sh" \
			block/disk \
			"${bdfs_to_wait_for[@]}" &
		sync_pid=$!
	fi
fi

if [[ $os == Linux ]]; then
	if [ "$mode" == "config" ]; then
		configure_linux
	elif [ "$mode" == "cleanup" ]; then
		cleanup_linux
		clear_hugepages
	elif [ "$mode" == "reset" ]; then
		reset_linux
	elif [ "$mode" == "status" ]; then
		status_linux
	elif [ "$mode" == "help" ]; then
		usage $0
	else
		usage $0 "Invalid argument '$mode'"
	fi
else
	if [ "$mode" == "config" ]; then
		configure_freebsd
	elif [ "$mode" == "reset" ]; then
		reset_freebsd
	elif [ "$mode" == "cleanup" ]; then
		echo "setup.sh cleanup function not yet supported on $os"
	elif [ "$mode" == "status" ]; then
		status_freebsd
	elif [ "$mode" == "help" ]; then
		usage $0
	else
		usage $0 "Invalid argument '$mode'"
	fi
fi

if [[ -e /proc/$sync_pid/status ]]; then
	wait "$sync_pid"
fi

kmsg "spdk: $0 $* (done)"