#!/usr/bin/env bash
# SPDX-License-Identifier: BSD-3-Clause
# Copyright (C) 2016 Intel Corporation
# All rights reserved.
#
set -e
shopt -s nullglob extglob

os=$(uname -s)

if [[ $os != Linux && $os != FreeBSD ]]; then
	echo "Unsupported platform ($os), aborting"
	exit 1
fi

rootdir=$(readlink -f "$(dirname "$0")")/..
source "$rootdir/scripts/common.sh"

function usage() {
	if [[ $os == Linux ]]; then
		options="[config|reset|status|cleanup|interactive|help]"
	else
		options="[config|reset|interactive|help]"
	fi

	[[ -n $2 ]] && (
		echo "$2"
		echo ""
	)
	echo "Helper script for allocating hugepages and binding NVMe, I/OAT, VMD and Virtio devices"
	echo "to a generic VFIO kernel driver. If VFIO is not available on the system, this script"
	echo "will fall back to UIO. NVMe and Virtio devices with active mountpoints will be ignored."
	echo "All hugepage operations use the default hugepage size on the system (hugepagesz)."
	echo "Usage: $(basename "$1") $options"
	echo
	echo "$options - as follows:"
	echo "config            Default mode. Allocate hugepages and bind PCI devices."
	if [[ $os == Linux ]]; then
		echo "cleanup           Remove any orphaned files that may be left in the system after an SPDK application exits"
	fi
	echo "reset             Rebind PCI devices back to their original drivers."
	echo "                  Also clean up any leftover SPDK files/resources."
	echo "                  Hugepage memory size will remain unchanged."
	if [[ $os == Linux ]]; then
		echo "status            Print status of all SPDK-compatible devices on the system."
	fi
	echo "interactive       Execute the script in interactive mode."
	echo "help              Print this help message."
	echo
	echo "The following environment variables can be specified."
	echo "HUGEMEM           Size of hugepage memory to allocate (in MB). 2048 by default."
	echo "                  For NUMA systems, the hugepages will be distributed on node0 by"
	echo "                  default."
	echo "HUGE_EVEN_ALLOC   If set to 'yes', hugepages will be evenly distributed across all"
	echo "                  of the system's NUMA nodes (effectively ignoring anything set in HUGENODE)."
	echo "                  Uses the kernel's default hugepage size."
	echo "NRHUGE            Number of hugepages to allocate. This variable overrides HUGEMEM."
	echo "HUGENODE          Specific NUMA node to allocate hugepages on. Multiple nodes can be"
	echo "                  separated with commas. By default, NRHUGE will be applied on each node."
	echo "                  Hugepages can be defined per node with e.g.:"
	echo "                  HUGENODE='nodes_hp[0]=2048,nodes_hp[1]=512,2' - this will allocate"
	echo "                  2048 pages for node0, 512 for node1 and default NRHUGE for node2."
	echo "HUGEPGSZ          Size of the hugepages to use in kB. If not set, the kernel's default"
	echo "                  setting is used."
	echo "SHRINK_HUGE       If set to 'yes', hugepages allocation won't be skipped in case"
	echo "                  the number of requested hugepages is lower than what's already"
	echo "                  allocated. Doesn't apply when HUGE_EVEN_ALLOC is in use."
	echo "CLEAR_HUGE        If set to 'yes', an attempt to remove hugepages from all nodes will"
	echo "                  be made prior to allocation."
	echo "PCI_ALLOWED"
	echo "PCI_BLOCKED       Whitespace separated list of PCI devices (NVMe, I/OAT, VMD, Virtio)."
	echo "                  Each device must be specified as a full PCI address."
	echo "                  E.g. PCI_ALLOWED=\"0000:01:00.0 0000:02:00.0\""
	echo "                  To block all PCI devices: PCI_ALLOWED=\"none\""
	echo "                  To allow all PCI devices except 0000:01:00.0: PCI_BLOCKED=\"0000:01:00.0\""
	echo "                  To allow only PCI device 0000:01:00.0: PCI_ALLOWED=\"0000:01:00.0\""
	echo "                  If PCI_ALLOWED and PCI_BLOCKED are empty or unset, all PCI devices"
	echo "                  will be bound."
	echo "                  Each device in PCI_BLOCKED will be ignored (driver won't be changed)."
	echo "                  PCI_BLOCKED has precedence over PCI_ALLOWED."
	echo "TARGET_USER       User that will own hugepage mountpoint directory and vfio groups."
	echo "                  By default the current user will be used."
	echo "DRIVER_OVERRIDE   Disable automatic vfio-pci/uio_pci_generic selection and forcefully"
	echo "                  bind devices to the given driver."
	echo "                  E.g. DRIVER_OVERRIDE=uio_pci_generic or DRIVER_OVERRIDE=/home/public/dpdk/build/kmod/igb_uio.ko"
	echo "PCI_BLOCK_SYNC_ON_RESET"
	echo "                  If set in the environment, an attempt to wait for block devices associated"
	echo "                  with a given PCI device will be made upon reset."
	echo "UNBIND_ENTIRE_IOMMU_GROUP"
	echo "                  If set, all devices from the NVMe device's IOMMU group will be unbound"
	echo "                  from their drivers. Use with caution."
	exit 0
}

# On monolithic kernels lsmod won't work, so fall back to checking /sys/module.
# We also check /sys/bus/pci/drivers/, since neither lsmod nor /sys/module may
# contain the needed info (e.g. on Fedora-like distros).
function check_for_driver() {
	if [[ -z $1 ]]; then
		return 0
	fi

	if lsmod | grep -q "${1//-/_}"; then
		return 1
	fi

	if [[ -d /sys/module/${1} || -d \
		/sys/module/${1//-/_} || -d \
		/sys/bus/pci/drivers/${1} || -d \
		/sys/bus/pci/drivers/${1//-/_} ]]; then
		return 2
	fi
	return 0
}
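# Note on check_for_driver() return codes above: 0 means the driver was not
# found, 1 means it's loaded as a module (visible in lsmod), 2 means it's
# built in or registered on the PCI bus (visible in sysfs). Callers treat any
# non-zero value as "driver available".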
function check_for_driver_freebsd() {
	# Check if DPDK drivers (nic_uio, contigmem) are in the kernel's module path.
	local search_paths path driver
	IFS=";" read -ra search_paths < <(kldconfig -rU)

	for driver in contigmem.ko nic_uio.ko; do
		for path in "${search_paths[@]}"; do
			[[ -f $path/$driver ]] && continue 2
		done
		return 1
	done
	return 0
}

function pci_dev_echo() {
	local bdf="$1"
	shift
	echo "$bdf (${pci_ids_vendor["$bdf"]#0x} ${pci_ids_device["$bdf"]#0x}): $*"
}

function probe_driver() {
	local bdf=$1
	local driver_name=$2
	local old_driver_name=${drivers_d["$bdf"]:-no driver}

	if [[ $driver_name == "$old_driver_name" ]]; then
		pci_dev_echo "$bdf" "Already using the $old_driver_name driver"
		return 0
	fi

	if [[ $old_driver_name != "no driver" ]]; then
		echo "$bdf" > "/sys/bus/pci/devices/$bdf/driver/unbind"
	fi

	pci_dev_echo "$bdf" "$old_driver_name -> $driver_name"

	if [[ $driver_name == "none" ]]; then
		return 0
	fi

	local probe_attempts=0
	echo "$driver_name" > "/sys/bus/pci/devices/$bdf/driver_override"
	while ! echo "$bdf" > "/sys/bus/pci/drivers_probe" && ((probe_attempts++ < 10)); do
		pci_dev_echo "$bdf" "failed to bind to $driver_name, retrying ($probe_attempts)"
		sleep 0.5
	done 2> /dev/null

	echo "" > "/sys/bus/pci/devices/$bdf/driver_override"

	if [[ ! -e /sys/bus/pci/drivers/$driver_name/$bdf ]]; then
		pci_dev_echo "$bdf" "failed to bind to $driver_name, aborting"
		return 1
	fi
}
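# For reference, the bind sequence in probe_driver() above is the sysfs
# equivalent of the following manual steps (BDF and driver purely illustrative),
# followed by clearing driver_override again:
#   echo "0000:01:00.0" > /sys/bus/pci/devices/0000:01:00.0/driver/unbind
#   echo "vfio-pci" > /sys/bus/pci/devices/0000:01:00.0/driver_override
#   echo "0000:01:00.0" > /sys/bus/pci/drivers_probe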
function linux_bind_driver() {
	local bdf="$1"
	local driver_name="$2"

	probe_driver "$bdf" "$driver_name"

	iommu_group=$(basename "$(readlink -f "/sys/bus/pci/devices/$bdf/iommu_group")")
	if [ -e "/dev/vfio/$iommu_group" ]; then
		if [ -n "$TARGET_USER" ]; then
			chown "$TARGET_USER" "/dev/vfio/$iommu_group"
		fi
	fi

	local iommug=("/sys/bus/pci/devices/$bdf/iommu_group/devices/"!($bdf))
	local _bdf _driver
	if ((${#iommug[@]} > 0)) && [[ $driver_name == vfio* ]]; then
		pci_dev_echo "$bdf" "WARNING: detected multiple devices (${#iommug[@]}) under the same IOMMU group!"
		for _bdf in "${iommug[@]}"; do
			_driver=$(readlink -f "$_bdf/driver")
			if [[ ! -e $_driver || ${_driver##*/} == "$driver_name" ]]; then
				continue
			fi
			# See what DPDK considers to be a "viable" IOMMU group: dpdk/lib/eal/linux/eal_vfio.c -> rte_vfio_setup_device()
			pci_dev_echo "$bdf" "WARNING: ${_bdf##*/} not bound to $driver_name (${_driver##*/})"
			pci_dev_echo "$bdf" "WARNING: All devices in the IOMMU group must be bound to the same driver or unbound"
			if [[ $UNBIND_ENTIRE_IOMMU_GROUP == yes ]]; then
				pci_dev_echo "$bdf" "WARNING: Attempting to unbind ${_bdf##*/}"
				drivers_d["${_bdf##*/}"]=${_driver##*/}
				probe_driver "${_bdf##*/}" none
			fi
		done
	fi
}

function linux_unbind_driver() {
	local bdf="$1"
	local old_driver_name=${drivers_d["$bdf"]:-no driver}

	if [[ $old_driver_name == "no driver" ]]; then
		pci_dev_echo "$bdf" "Not bound to any driver"
		return 0
	fi

	if [[ -e /sys/bus/pci/drivers/$old_driver_name ]]; then
		echo "$bdf" > "/sys/bus/pci/drivers/$old_driver_name/unbind"
		echo "" > "/sys/bus/pci/devices/$bdf/driver_override"
	fi

	pci_dev_echo "$bdf" "$old_driver_name -> no driver"
}

function linux_hugetlbfs_mounts() {
	mount | grep ' type hugetlbfs ' | awk '{ print $3 }'
}

function get_block_dev_from_bdf() {
	local bdf=$1
	local block blocks=() ctrl

	for block in /sys/block/*; do
		if [[ $block == *nvme* ]]; then
			ctrl=$(readlink -f "$block/device") ctrl=${ctrl##*/}
			if [[ -e /sys/class/nvme/$ctrl && $(< "/sys/class/nvme/$ctrl/address") == "$bdf" ]]; then
				blocks+=("${block##*/}")
			fi
		elif [[ $(readlink -f "$block/device") == *"/$bdf/"* ]]; then
			blocks+=("${block##*/}")
		fi
	done
	printf '%s\n' "${blocks[@]}"
}
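# Example (illustrative): for an NVMe controller at 0000:01:00.0 exposing a
# single namespace, get_block_dev_from_bdf 0000:01:00.0 prints "nvme0n1";
# for a virtio-blk device it would print the matching "vdX" name instead.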
function get_used_bdf_block_devs() {
	local bdf=$1
	local blocks block blockp dev mount holder
	local used

	hash lsblk &> /dev/null || return 1
	blocks=($(get_block_dev_from_bdf "$bdf"))

	for block in "${blocks[@]}"; do
		# Check if the device is held by some other device, regardless of
		# whether it's mounted or not.
		for holder in "/sys/class/block/$block"*/holders/*; do
			[[ -e $holder ]] || continue
			blockp=${holder%/holders*} blockp=${blockp##*/}
			if [[ -e $holder/slaves/$blockp ]]; then
				used+=("holder@$blockp:${holder##*/}")
			fi
		done
		while read -r dev mount; do
			if [[ -e $mount ]]; then
				used+=("mount@$block:$dev")
			fi
		done < <(lsblk -l -n -o NAME,MOUNTPOINT "/dev/$block")
		if ((${#used[@]} == 0)); then
			# Make sure we check if there's any valid data present on the target
			# device, regardless of whether it's being actively used or not. This
			# is mainly done to make sure we don't miss more complex setups like
			# ZFS pools, etc.
			if block_in_use "$block" > /dev/null; then
				used+=("data@$block")
			fi
		fi
	done

	if ((${#used[@]} > 0)); then
		printf '%s\n' "${used[@]}"
	fi
}

function collect_devices() {
	# NVMe, IOAT, DSA, IAA, VIRTIO, VMD

	local ids dev_type dev_id bdf bdfs in_use driver

	ids+="PCI_DEVICE_ID_INTEL_IOAT"
	ids+="|PCI_DEVICE_ID_INTEL_DSA"
	ids+="|PCI_DEVICE_ID_INTEL_IAA"
	ids+="|PCI_DEVICE_ID_VIRTIO"
	ids+="|PCI_DEVICE_ID_INTEL_VMD"
	ids+="|SPDK_PCI_CLASS_NVME"

	local -gA nvme_d ioat_d dsa_d iaa_d virtio_d vmd_d all_devices_d drivers_d types_d all_devices_type_d

	while read -r _ dev_type dev_id; do
		bdfs=(${pci_bus_cache["0x8086:$dev_id"]})
		[[ $dev_type == *NVME* ]] && bdfs=(${pci_bus_cache["$dev_id"]})
		[[ $dev_type == *VIRT* ]] && bdfs=(${pci_bus_cache["0x1af4:$dev_id"]})
		[[ $dev_type =~ (NVME|IOAT|DSA|IAA|VIRTIO|VMD) ]] && dev_type=${BASH_REMATCH[1],,}
		types_d["$dev_type"]=1
		for bdf in "${bdfs[@]}"; do
			in_use=0
			if [[ $1 != status ]]; then
				if ! pci_can_use "$bdf"; then
					pci_dev_echo "$bdf" "Skipping denied controller at $bdf"
					in_use=1
				fi
				if [[ $dev_type == nvme || $dev_type == virtio ]]; then
					if ! verify_bdf_block_devs "$bdf"; then
						in_use=1
					fi
				fi
				if [[ $dev_type == vmd ]]; then
					if [[ $PCI_ALLOWED != *"$bdf"* ]]; then
						pci_dev_echo "$bdf" "Skipping not allowed VMD controller at $bdf"
						in_use=1
					elif [[ " ${drivers_d[*]} " =~ "nvme" ]]; then
						if [[ "${DRIVER_OVERRIDE}" != "none" ]]; then
							if [ "$mode" == "config" ]; then
								cat <<- MESSAGE
									Binding new driver to VMD device. If there are NVMe SSDs behind the VMD endpoint
									which are attached to the kernel NVMe driver, the binding process may go faster
									if you first run this script with DRIVER_OVERRIDE="none" to unbind only the
									NVMe SSDs, and then run again to unbind the VMD devices.
								MESSAGE
							fi
						fi
					fi
				fi
			fi
			eval "${dev_type}_d[$bdf]=$in_use"
			all_devices_d["$bdf"]=$in_use
			all_devices_type_d["$bdf"]=$dev_type
			if [[ -e /sys/bus/pci/devices/$bdf/driver ]]; then
				driver=$(readlink -f "/sys/bus/pci/devices/$bdf/driver")
				drivers_d["$bdf"]=${driver##*/}
			else
				drivers_d["$bdf"]=""
			fi
		done
	done < <(grep -E "$ids" "$rootdir/include/spdk/pci_ids.h")
}
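# After collect_devices() returns, each per-type array maps a BDF to its in_use
# flag, e.g. (illustrative) nvme_d["0000:01:00.0"]=0 marks that NVMe controller
# as eligible for (re)binding, while 1 means it will be skipped. drivers_d maps
# each BDF to its currently bound kernel driver (empty string if unbound).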
function collect_driver() {
	local bdf=$1
	local drivers driver

	if [[ -e /sys/bus/pci/devices/$bdf/modalias ]] \
		&& drivers=($(modprobe -R "$(< "/sys/bus/pci/devices/$bdf/modalias")")); then
		# Pick the first entry in case multiple aliases are bound to a driver.
		driver=$(readlink -f "/sys/module/${drivers[0]}/drivers/pci:"*)
		driver=${driver##*/}
	else
		[[ -n ${nvme_d["$bdf"]} ]] && driver=nvme
		[[ -n ${ioat_d["$bdf"]} ]] && driver=ioatdma
		[[ -n ${dsa_d["$bdf"]} ]] && driver=dsa
		[[ -n ${iaa_d["$bdf"]} ]] && driver=iaa
		[[ -n ${virtio_d["$bdf"]} ]] && driver=virtio-pci
		[[ -n ${vmd_d["$bdf"]} ]] && driver=vmd
	fi 2> /dev/null
	echo "$driver"
}

function verify_bdf_block_devs() {
	local bdf=$1
	local blknames
	blknames=($(get_used_bdf_block_devs "$bdf")) || return 1

	if ((${#blknames[@]} > 0)); then
		local IFS=","
		pci_dev_echo "$bdf" "Active devices: ${blknames[*]}, so not binding PCI dev"
		return 1
	fi
}

function configure_linux_pci() {
	local driver_path=""
	driver_name=""
	igb_uio_fallback=""

	if [[ -r "$rootdir/dpdk/build-tmp/kernel/linux/igb_uio/igb_uio.ko" ]]; then
		# igb_uio is a common driver to override with and it depends on uio.
		modprobe uio || true
		if ! check_for_driver igb_uio || insmod "$rootdir/dpdk/build-tmp/kernel/linux/igb_uio/igb_uio.ko"; then
			igb_uio_fallback="$rootdir/dpdk/build-tmp/kernel/linux/igb_uio/igb_uio.ko"
		fi
	fi

	if [[ "${DRIVER_OVERRIDE}" == "none" ]]; then
		driver_name=none
	elif [[ -n "${DRIVER_OVERRIDE}" ]]; then
		driver_path="$DRIVER_OVERRIDE"
		driver_name="${DRIVER_OVERRIDE##*/}"
		# modprobe and the sysfs don't use the .ko suffix.
		driver_name=${driver_name%.ko}
		# path = name -> there is no path
		if [[ "$driver_path" = "$driver_name" ]]; then
			driver_path=""
		fi
	elif [[ -n "$(ls /sys/kernel/iommu_groups)" || (-e \
		/sys/module/vfio/parameters/enable_unsafe_noiommu_mode && \
		"$(cat /sys/module/vfio/parameters/enable_unsafe_noiommu_mode)" == "Y") ]]; then
		driver_name=vfio-pci
		# Just in case, attempt to load the VFIO_IOMMU_TYPE1 module into the kernel - this
		# should be done automatically by modprobe since this particular module should
		# be a part of vfio-pci dependencies, however, on some distros, it seems that
		# it's not the case. See #1689.
		if modinfo vfio_iommu_type1 > /dev/null; then
			modprobe vfio_iommu_type1
		fi
	elif ! check_for_driver uio_pci_generic || modinfo uio_pci_generic > /dev/null 2>&1; then
		driver_name=uio_pci_generic
	elif [[ -e $igb_uio_fallback ]]; then
		driver_path="$igb_uio_fallback"
		driver_name="igb_uio"
		echo "WARNING: uio_pci_generic not detected - using $driver_name"
	else
		echo "No valid drivers found [vfio-pci, uio_pci_generic, igb_uio]. Please enable one of the kernel modules."
		return 1
	fi

	# modprobe locates the module by name. If the user passed in a path, use insmod instead.
	if [[ $driver_name != "none" ]]; then
		if [[ -n "$driver_path" ]]; then
			insmod "$driver_path" || true
		else
			modprobe "$driver_name"
		fi
	fi

	for bdf in "${!all_devices_d[@]}"; do
		if ((all_devices_d["$bdf"] == 0)); then
			if [[ -n ${nvme_d["$bdf"]} ]]; then
				# Some nvme controllers may take a significant amount of time while being
				# unbound from the driver. Put that task into the background to speed up the
				# whole process. Currently this is done only for the devices bound to the
				# nvme driver as others, e.g. ioatdma, trigger a kernel BUG when being
				# unbound in parallel. See https://bugzilla.kernel.org/show_bug.cgi?id=209041.
				linux_bind_driver "$bdf" "$driver_name" &
			else
				linux_bind_driver "$bdf" "$driver_name"
			fi
		fi
	done
	wait

	echo "1" > "/sys/bus/pci/rescan"
}
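# Driver selection order in configure_linux_pci(), for reference: an explicit
# DRIVER_OVERRIDE wins; otherwise vfio-pci is used when an IOMMU is present (or
# unsafe no-IOMMU mode is enabled), then uio_pci_generic, and finally the
# locally built igb_uio module as a last resort.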
function cleanup_linux() {
	local dirs_to_clean=() files_to_clean=() opened_files=() file_locks=()
	local match_spdk="spdk_tgt|iscsi|vhost|nvmf|rocksdb|bdevio|bdevperf|vhost_fuzz|nvme_fuzz|accel_perf|bdev_svc"

	dirs_to_clean=({/var/run,/tmp}/dpdk/spdk{,_pid}+([0-9]))
	if [[ -d $XDG_RUNTIME_DIR ]]; then
		dirs_to_clean+=("$XDG_RUNTIME_DIR/dpdk/spdk"{,_pid}+([0-9]))
	fi

	for dir in "${dirs_to_clean[@]}"; do
		files_to_clean+=("$dir/"*)
	done
	file_locks+=(/var/tmp/spdk_pci_lock*)
	file_locks+=(/var/tmp/spdk_cpu_lock*)

	files_to_clean+=(/dev/shm/@(@($match_spdk)_trace|spdk_iscsi_conns)*)
	files_to_clean+=("${file_locks[@]}")

	# This may fail in case the path that readlink attempts to resolve suddenly
	# disappears (as may happen with terminating processes).
	opened_files+=($(readlink -f /proc/+([0-9])/fd/+([0-9]))) || true

	if ((${#opened_files[@]} == 0)); then
		echo "Can't get list of opened files!"
		exit 1
	fi

	echo 'Cleaning'
	for f in "${files_to_clean[@]}"; do
		[[ -e $f ]] || continue
		if [[ ${opened_files[*]} != *"$f"* ]]; then
			echo "Removing: $f"
			rm "$f"
		else
			echo "Still open: $f"
		fi
	done

	for dir in "${dirs_to_clean[@]}"; do
		[[ -d $dir ]] || continue
		if [[ ${opened_files[*]} != *"$dir"* ]]; then
			echo "Removing: $dir"
			rmdir "$dir"
		else
			echo "Still open: $dir"
		fi
	done
	echo "Clean"
}

check_hugepages_alloc() {
	local hp_int=$1
	local allocated_hugepages

	allocated_hugepages=$(< "$hp_int")

	if ((NRHUGE <= allocated_hugepages)) && [[ $SHRINK_HUGE != yes ]]; then
		echo "INFO: Requested $NRHUGE hugepages but $allocated_hugepages already allocated ${2:+on node$2}"
		return 0
	fi

	echo $((NRHUGE < 0 ? 0 : NRHUGE)) > "$hp_int"

	allocated_hugepages=$(< "$hp_int")
	if ((allocated_hugepages < NRHUGE)); then
		cat <<- ERROR

			## ERROR: requested $NRHUGE hugepages but only $allocated_hugepages could be allocated ${2:+on node$2}.
			## Memory might be heavily fragmented. Please try flushing the system cache, or reboot the machine.
		ERROR
		return 1
	fi
}

clear_hugepages() { echo 0 > /proc/sys/vm/nr_hugepages; }
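# Example (illustrative): running with HUGENODE='nodes_hp[0]=2048,1' NRHUGE=512
# makes configure_linux_hugepages() below allocate 2048 pages on node0 and the
# NRHUGE default of 512 pages on node1.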
configure_linux_hugepages() {
	local node system_nodes
	local nodes_to_use nodes_hp

	if [[ $CLEAR_HUGE == yes ]]; then
		clear_hugepages
	fi

	if [[ $HUGE_EVEN_ALLOC == yes ]]; then
		clear_hugepages
		check_hugepages_alloc /proc/sys/vm/nr_hugepages
		return 0
	fi

	for node in /sys/devices/system/node/node*; do
		[[ -e $node ]] || continue
		nodes[${node##*node}]=$node/hugepages/hugepages-${HUGEPGSZ}kB/nr_hugepages
	done

	if ((${#nodes[@]} == 0)); then
		# No NUMA support? Fall back to the common interface.
		check_hugepages_alloc /proc/sys/vm/nr_hugepages
		return 0
	fi

	IFS="," read -ra nodes_to_use <<< "$HUGENODE"
	if ((${#nodes_to_use[@]} == 0)); then
		nodes_to_use[0]=0
	fi

	# Align indexes with node ids
	for node in "${!nodes_to_use[@]}"; do
		if [[ ${nodes_to_use[node]} =~ ^nodes_hp\[[0-9]+\]= ]]; then
			eval "${nodes_to_use[node]}"
		elif [[ ${nodes_to_use[node]} =~ ^[0-9]+$ ]]; then
			nodes_hp[nodes_to_use[node]]=$NRHUGE
		fi
	done

	for node in "${!nodes_hp[@]}"; do
		if [[ -z ${nodes[node]} ]]; then
			echo "Node $node doesn't exist, ignoring" >&2
			continue
		fi
		NRHUGE=${nodes_hp[node]:-$NRHUGE} check_hugepages_alloc "${nodes[node]}" "$node"
	done
}

function configure_linux() {
	configure_linux_pci
	hugetlbfs_mounts=$(linux_hugetlbfs_mounts)

	if [ -z "$hugetlbfs_mounts" ]; then
		hugetlbfs_mounts=/mnt/huge
		echo "Mounting hugetlbfs at $hugetlbfs_mounts"
		mkdir -p "$hugetlbfs_mounts"
		mount -t hugetlbfs nodev "$hugetlbfs_mounts"
	fi

	configure_linux_hugepages

	if [ "$driver_name" = "vfio-pci" ]; then
		if [ -n "$TARGET_USER" ]; then
			for mount in $hugetlbfs_mounts; do
				chown "$TARGET_USER" "$mount"
				chmod g+w "$mount"
			done

			MEMLOCK_AMNT=$(su "$TARGET_USER" -c "ulimit -l")
			if [[ $MEMLOCK_AMNT != "unlimited" ]]; then
				MEMLOCK_MB=$((MEMLOCK_AMNT / 1024))
				cat <<- MEMLOCK
					"$TARGET_USER" user memlock limit: $MEMLOCK_MB MB

					This is the maximum amount of memory you will be
					able to use with DPDK and VFIO if run as user "$TARGET_USER".
					To change this, please adjust limits.conf memlock limit for user "$TARGET_USER".
				MEMLOCK
				if ((MEMLOCK_AMNT < 65536)); then
					echo ""
					echo "## WARNING: memlock limit is less than 64MB"
					echo -n "## DPDK with VFIO may not be able to initialize "
					echo "if run as user \"$TARGET_USER\"."
				fi
			fi
		fi
	fi

	if [ "$(uname -i)" == "x86_64" ] && [ ! -e /dev/cpu/0/msr ]; then
		# Some distros build msr as a module. Make sure it's loaded to ensure
		# DPDK can easily figure out the TSC rate rather than relying on 100ms
		# sleeps.
		modprobe msr &> /dev/null || true
	fi
}
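# A quick way to verify the 'config' outcome (commands purely illustrative):
#   ./setup.sh status        # shows driver bindings and hugepage counts
#   grep Huge /proc/meminfo  # shows system-wide hugepage totals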
function reset_linux_pci() {
	# virtio
	# TODO: check if virtio-pci is loaded first and just unbind if it is not loaded
	# Requires some more investigation - for example, some kernels do not seem to have
	# virtio-pci but just virtio_scsi instead. Also need to make sure we get the
	# underscore vs. dash right in the virtio_scsi name.
	modprobe virtio-pci || true
	for bdf in "${!all_devices_d[@]}"; do
		((all_devices_d["$bdf"] == 0)) || continue

		driver=$(collect_driver "$bdf")
		if [[ -n $driver ]] && ! check_for_driver "$driver"; then
			linux_bind_driver "$bdf" "$driver"
		else
			linux_unbind_driver "$bdf"
		fi
	done

	echo "1" > "/sys/bus/pci/rescan"
}

function reset_linux() {
	reset_linux_pci
	for mount in $(linux_hugetlbfs_mounts); do
		for hp in "$mount"/spdk*map_*; do
			flock -n "$hp" true && rm -f "$hp"
		done
	done
	rm -f /run/.spdk*
}

function status_linux() {
	echo "Hugepages" >&2
	printf "%-6s %10s %8s / %6s\n" "node" "hugesize" "free" "total" >&2

	numa_nodes=0
	for path in /sys/devices/system/node/node*/hugepages/hugepages-*/; do
		numa_nodes=$((numa_nodes + 1))
		free_pages=$(cat "$path/free_hugepages")
		all_pages=$(cat "$path/nr_hugepages")

		[[ $path =~ (node[0-9]+)/hugepages/hugepages-([0-9]+kB) ]]

		node=${BASH_REMATCH[1]}
		huge_size=${BASH_REMATCH[2]}

		printf "%-6s %10s %8s / %6s\n" "$node" "$huge_size" "$free_pages" "$all_pages"
	done

	# fall back to system-wide hugepages
	if [ "$numa_nodes" = "0" ]; then
		free_pages=$(grep HugePages_Free /proc/meminfo | awk '{ print $2 }')
		all_pages=$(grep HugePages_Total /proc/meminfo | awk '{ print $2 }')
		node="-"
		huge_size="$HUGEPGSZ"

		printf "%-6s %10s %8s / %6s\n" "$node" "$huge_size" "$free_pages" "$all_pages"
	fi

	printf '\n%-8s %-15s %-6s %-6s %-7s %-16s %-10s %s\n' \
		"Type" "BDF" "Vendor" "Device" "NUMA" "Driver" "Device" "Block devices" >&2

	sorted_bdfs=($(printf '%s\n' "${!all_devices_d[@]}" | sort))

	for bdf in "${sorted_bdfs[@]}"; do
		driver=${drivers_d["$bdf"]}
		if [ "$numa_nodes" = "0" ]; then
			node="-"
		else
			node=$(cat "/sys/bus/pci/devices/$bdf/numa_node")
			if ((node == -1)); then
				node=unknown
			fi
		fi
		if [ "$driver" = "nvme" ] && [ -d /sys/bus/pci/devices/$bdf/nvme ]; then
			name=$(ls "/sys/bus/pci/devices/$bdf/nvme")
		else
			name="-"
		fi

		if [[ -n ${nvme_d["$bdf"]} || -n ${virtio_d["$bdf"]} ]]; then
			blknames=($(get_block_dev_from_bdf "$bdf"))
		else
			blknames=("-")
		fi

		desc=""
		desc=${desc:-${nvme_d["$bdf"]:+NVMe}}
		desc=${desc:-${ioat_d["$bdf"]:+I/OAT}}
		desc=${desc:-${dsa_d["$bdf"]:+DSA}}
		desc=${desc:-${iaa_d["$bdf"]:+IAA}}
		desc=${desc:-${virtio_d["$bdf"]:+virtio}}
		desc=${desc:-${vmd_d["$bdf"]:+VMD}}

		printf '%-8s %-15s %-6s %-6s %-7s %-16s %-10s %s\n' \
			"$desc" "$bdf" "${pci_ids_vendor["$bdf"]#0x}" "${pci_ids_device["$bdf"]#0x}" \
			"$node" "${driver:--}" "${name:-}" "${blknames[*]:--}"
	done
}
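# Sample status_linux() device line (all values illustrative):
#   NVMe     0000:01:00.0    8086   0953   0       nvme             nvme0      nvme0n1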
function status_freebsd() {
	local pci

	status_print() (
		local type=$1
		local dev driver

		shift

		for pci; do
			printf '%-8s %-15s %-6s %-6s %-16s\n' \
				"$type" \
				"$pci" \
				"${pci_ids_vendor["$pci"]}" \
				"${pci_ids_device["$pci"]}" \
				"${pci_bus_driver["$pci"]}"
		done | sort -k2,2
	)

	local contigmem=present
	local contigmem_buffer_size
	local contigmem_num_buffers

	if ! kldstat -q -m contigmem; then
		contigmem="not present"
	fi
	if ! contigmem_buffer_size=$(kenv hw.contigmem.buffer_size 2> /dev/null); then
		contigmem_buffer_size="not set"
	fi
	if ! contigmem_num_buffers=$(kenv hw.contigmem.num_buffers 2> /dev/null); then
		contigmem_num_buffers="not set"
	fi

	cat <<- BSD_INFO
		Contigmem ($contigmem)
		Buffer Size: $contigmem_buffer_size
		Num Buffers: $contigmem_num_buffers

	BSD_INFO

	printf '\n%-8s %-15s %-6s %-6s %-16s\n' \
		"Type" "BDF" "Vendor" "Device" "Driver" >&2

	status_print "NVMe" "${!nvme_d[@]}"
	status_print "I/OAT" "${!ioat_d[@]}"
	status_print "DSA" "${!dsa_d[@]}"
	status_print "IAA" "${!iaa_d[@]}"
	status_print "VMD" "${!vmd_d[@]}"
}

function configure_freebsd_pci() {
	local BDFS

	BDFS+=("${!nvme_d[@]}")
	BDFS+=("${!ioat_d[@]}")
	BDFS+=("${!dsa_d[@]}")
	BDFS+=("${!iaa_d[@]}")
	BDFS+=("${!vmd_d[@]}")

	# Drop the domain part from all the addresses
	BDFS=("${BDFS[@]#*:}")

	local IFS=","
	kldunload nic_uio.ko || true
	kenv hw.nic_uio.bdfs="${BDFS[*]}"
	kldload nic_uio.ko
}

function configure_freebsd() {
	if ! check_for_driver_freebsd; then
		echo "DPDK drivers (contigmem and/or nic_uio) are missing, aborting" >&2
		return 1
	fi
	configure_freebsd_pci
	# If contigmem is already loaded but the HUGEMEM specified doesn't match the
	# previous value, unload contigmem so that we can reload with the new value.
	if kldstat -q -m contigmem; then
		# contigmem may be loaded, but the kernel environment won't necessarily
		# be set at this point. If it isn't, kenv will fail to pick up the
		# hw. options. Handle it.
		if ! contigmem_num_buffers=$(kenv hw.contigmem.num_buffers); then
			contigmem_num_buffers=-1
		fi 2> /dev/null
		if ((contigmem_num_buffers != HUGEMEM / 256)); then
			kldunload contigmem.ko
		fi
	fi
	if ! kldstat -q -m contigmem; then
		kenv hw.contigmem.num_buffers=$((HUGEMEM / 256))
		kenv hw.contigmem.buffer_size=$((256 * 1024 * 1024))
		kldload contigmem.ko
	fi
}

function reset_freebsd() {
	kldunload contigmem.ko || true
	kldunload nic_uio.ko || true
}

function set_hp() {
	if [[ -n $HUGEPGSZ && ! -e /sys/kernel/mm/hugepages/hugepages-${HUGEPGSZ}kB ]]; then
		echo "${HUGEPGSZ}kB is not supported by the running kernel, ignoring" >&2
		unset -v HUGEPGSZ
	fi

	HUGEPGSZ=${HUGEPGSZ:-$(grep Hugepagesize /proc/meminfo | cut -d : -f 2 | tr -dc '0-9')}
	HUGEPGSZ_MB=$((HUGEPGSZ / 1024))
	NRHUGE=${NRHUGE:-$(((HUGEMEM + HUGEPGSZ_MB - 1) / HUGEPGSZ_MB))}
}
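# Example of the set_hp() arithmetic above (illustrative): with the default
# HUGEMEM=2048 (MB) and 2048 kB hugepages, HUGEPGSZ_MB=2 and
# NRHUGE=(2048 + 2 - 1) / 2 = 1024 pages.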
CMD=reset cache_pci_bus

mode=$1

if [ -z "$mode" ]; then
	mode="config"
fi

: "${HUGEMEM:=2048}"
: "${PCI_ALLOWED:=""}"
: "${PCI_BLOCKED:=""}"

if [ -n "$NVME_ALLOWED" ]; then
	PCI_ALLOWED="$PCI_ALLOWED $NVME_ALLOWED"
fi

if [ -n "$SKIP_PCI" ]; then
	PCI_ALLOWED="none"
fi

if [ -z "$TARGET_USER" ]; then
	TARGET_USER="$SUDO_USER"
	if [ -z "$TARGET_USER" ]; then
		TARGET_USER=$(logname 2> /dev/null) || true
	fi
fi

collect_devices "$mode"

if [[ $os == Linux ]]; then
	set_hp
fi

if [[ $mode == interactive ]]; then
	source "$rootdir/scripts/common/setup/interactive.sh"
	main_menu "$2" || exit 0
fi

if [[ $mode == reset && $PCI_BLOCK_SYNC_ON_RESET == yes ]]; then
	# Note that this will wait only for the first block device attached to
	# a given storage controller. For nvme this may miss some of the devs
	# in case multiple namespaces are in place.
	# FIXME: Wait for nvme controller(s) to be in live state and determine
	# number of configured namespaces, build list of potential block devs
	# and pass them to sync_dev_uevents. Is it worth the effort?
	bdfs_to_wait_for=()
	for bdf in "${!all_devices_d[@]}"; do
		((all_devices_d["$bdf"] == 0)) || continue
		if [[ -n ${nvme_d["$bdf"]} || -n ${virtio_d["$bdf"]} ]]; then
			[[ $(collect_driver "$bdf") != "${drivers_d["$bdf"]}" ]] || continue
			bdfs_to_wait_for+=("$bdf")
		fi
	done
	if ((${#bdfs_to_wait_for[@]} > 0)); then
		echo "Waiting for block devices as requested"
		export UEVENT_TIMEOUT=5 DEVPATH_LOOKUP=yes DEVPATH_SUBSYSTEM=pci
		"$rootdir/scripts/sync_dev_uevents.sh" \
			block/disk \
			"${bdfs_to_wait_for[@]}" &
		sync_pid=$!
	fi
fi

if [[ $os == Linux ]]; then
	if [ "$mode" == "config" ]; then
		configure_linux
	elif [ "$mode" == "cleanup" ]; then
		cleanup_linux
		clear_hugepages
	elif [ "$mode" == "reset" ]; then
		reset_linux
	elif [ "$mode" == "status" ]; then
		status_linux
	elif [ "$mode" == "help" ]; then
		usage "$0"
	else
		usage "$0" "Invalid argument '$mode'"
	fi
else
	if [ "$mode" == "config" ]; then
		configure_freebsd
	elif [ "$mode" == "reset" ]; then
		reset_freebsd
	elif [ "$mode" == "cleanup" ]; then
		echo "setup.sh cleanup function not yet supported on $os"
	elif [ "$mode" == "status" ]; then
		status_freebsd
	elif [ "$mode" == "help" ]; then
		usage "$0"
	else
		usage "$0" "Invalid argument '$mode'"
	fi
fi

if [[ -e /proc/$sync_pid/status ]]; then
	wait "$sync_pid"
fi
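# Example invocations (illustrative):
#   sudo HUGEMEM=4096 ./setup.sh config          # 4 GB of hugepages, bind all devices
#   sudo PCI_ALLOWED="0000:01:00.0" ./setup.sh   # bind a single device
#   sudo ./setup.sh reset                        # restore original drivers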