#!/usr/bin/env bash
# SPDX-License-Identifier: BSD-3-Clause
# Copyright (C) 2016 Intel Corporation
# All rights reserved.
#
set -e
shopt -s nullglob extglob

os=$(uname -s)

if [[ $os != Linux && $os != FreeBSD ]]; then
	echo "Unsupported platform ($os), aborting"
	exit 1
fi

rootdir=$(readlink -f "$(dirname "$0")")/..
source "$rootdir/scripts/common.sh"
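
# Typical invocations (illustrative only - see usage() below for the complete
# list of modes and environment variables):
#   sudo ./setup.sh                          # same as "config"
#   sudo HUGEMEM=4096 ./setup.sh config      # allocate 4 GB of hugepages
#   sudo PCI_BLOCKED="0000:01:00.0" ./setup.sh config
#   sudo ./setup.sh reset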

function usage() {
	if [[ $os == Linux ]]; then
		options="[config|reset|status|cleanup|help]"
	else
		options="[config|reset|help]"
	fi

	[[ -n $2 ]] && (
		echo "$2"
		echo ""
	)
	echo "Helper script for allocating hugepages and binding NVMe, I/OAT, VMD and Virtio devices"
	echo "to a generic VFIO kernel driver. If VFIO is not available on the system, this script"
	echo "will fall back to UIO. NVMe and Virtio devices with active mountpoints will be ignored."
	echo "All hugepage operations use the default hugepage size on the system (hugepagesz)."
	echo "Usage: $(basename $1) $options"
	echo
	echo "$options - as follows:"
	echo "config            Default mode. Allocate hugepages and bind PCI devices."
	if [[ $os == Linux ]]; then
		echo "cleanup           Remove any orphaned files that may be left in the system after SPDK application exit"
	fi
	echo "reset             Rebind PCI devices back to their original drivers."
	echo "                  Also clean up any leftover spdk files/resources."
	echo "                  Hugepage memory size will remain unchanged."
	if [[ $os == Linux ]]; then
		echo "status            Print status of all SPDK-compatible devices on the system."
	fi
	echo "help              Print this help message."
	echo
	echo "The following environment variables can be specified."
	echo "HUGEMEM           Size of hugepage memory to allocate (in MB). 2048 by default."
	echo "                  For NUMA systems, the hugepages will be distributed on node0 by"
	echo "                  default."
	echo "HUGE_EVEN_ALLOC   If set to 'yes', hugepages will be evenly distributed across all"
	echo "                  system's NUMA nodes (effectively ignoring anything set in HUGENODE)."
	echo "                  Uses the kernel's default hugepage size."
	echo "NRHUGE            Number of hugepages to allocate. This variable overrides HUGEMEM."
	echo "HUGENODE          Specific NUMA node to allocate hugepages on. Multiple nodes can be"
	echo "                  separated with commas. By default, NRHUGE will be applied on each node."
	echo "                  Hugepages can be defined per node with e.g.:"
	echo "                  HUGENODE='nodes_hp[0]=2048,nodes_hp[1]=512,2' - this will allocate"
	echo "                  2048 pages for node0, 512 for node1 and default NRHUGE for node2."
	echo "HUGEPGSZ          Size of the hugepages to use in kB. If not set, kernel's default"
	echo "                  setting is used."
	echo "SHRINK_HUGE       If set to 'yes', hugepages allocation won't be skipped in case"
	echo "                  the number of requested hugepages is lower than what's already"
	echo "                  allocated. Doesn't apply when HUGE_EVEN_ALLOC is in use."
	echo "CLEAR_HUGE        If set to 'yes', an attempt to remove hugepages from all nodes will"
	echo "                  be made prior to allocation."
	echo "PCI_ALLOWED"
	echo "PCI_BLOCKED       Whitespace-separated list of PCI devices (NVMe, I/OAT, VMD, Virtio)."
	echo "                  Each device must be specified as a full PCI address."
	echo "                  E.g. PCI_ALLOWED=\"0000:01:00.0 0000:02:00.0\""
	echo "                  To block all PCI devices: PCI_ALLOWED=\"none\""
	echo "                  To allow all PCI devices except 0000:01:00.0: PCI_BLOCKED=\"0000:01:00.0\""
	echo "                  To allow only PCI device 0000:01:00.0: PCI_ALLOWED=\"0000:01:00.0\""
	echo "                  If PCI_ALLOWED and PCI_BLOCKED are empty or unset, all PCI devices"
	echo "                  will be bound."
	echo "                  Each device in PCI_BLOCKED will be ignored (driver won't be changed)."
	echo "                  PCI_BLOCKED has precedence over PCI_ALLOWED."
	echo "TARGET_USER       User that will own hugepage mountpoint directory and vfio groups."
	echo "                  By default the current user will be used."
	echo "DRIVER_OVERRIDE   Disable automatic vfio-pci/uio_pci_generic selection and forcefully"
	echo "                  bind devices to the given driver."
	echo "                  E.g. DRIVER_OVERRIDE=uio_pci_generic or DRIVER_OVERRIDE=/home/public/dpdk/build/kmod/igb_uio.ko"
	echo "PCI_BLOCK_SYNC_ON_RESET"
	echo "                  If set in the environment, an attempt to wait for block devices associated"
	echo "                  with a given PCI device will be made upon reset."
	echo "UNBIND_ENTIRE_IOMMU_GROUP"
	echo "                  If set, all devices from the nvme device's iommu group will be unbound"
	echo "                  from their drivers. Use with caution."
	exit 0
}

# In monolithic kernels lsmod won't report built-in drivers, so fall back to
# /sys/module. We also check /sys/bus/pci/drivers/ since neither lsmod nor
# /sys/module may contain the needed info (e.g. on Fedora-like distros).
function check_for_driver() {
	if [[ -z $1 ]]; then
		return 0
	fi

	if lsmod | grep -q "${1//-/_}"; then
		return 1
	fi

	if [[ -d /sys/module/${1} || -d \
		/sys/module/${1//-/_} || -d \
		/sys/bus/pci/drivers/${1} || -d \
		/sys/bus/pci/drivers/${1//-/_} ]]; then
		return 2
	fi
	return 0
}
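
# Note the inverted convention above: check_for_driver returns non-zero when the
# driver IS present (1 - listed by lsmod, 2 - visible in sysfs) and 0 when it is
# not, so a hypothetical caller would read:
#   if ! check_for_driver igb_uio; then echo "igb_uio already present"; fi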

function check_for_driver_freebsd() {
	# Check if dpdk drivers (nic_uio, contigmem) are in the kernel's module path.
	local search_paths path driver
	IFS=";" read -ra search_paths < <(kldconfig -rU)

	for driver in contigmem.ko nic_uio.ko; do
		for path in "${search_paths[@]}"; do
			[[ -f $path/$driver ]] && continue 2
		done
		return 1
	done
	return 0
}

function pci_dev_echo() {
	local bdf="$1"
	shift
	echo "$bdf (${pci_ids_vendor["$bdf"]#0x} ${pci_ids_device["$bdf"]#0x}): $*"
}
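
# probe_driver below automates the manual driver_override sequence one would
# otherwise type by hand (illustrative, with a made-up BDF):
#   echo 0000:01:00.0 > /sys/bus/pci/devices/0000:01:00.0/driver/unbind
#   echo vfio-pci > /sys/bus/pci/devices/0000:01:00.0/driver_override
#   echo 0000:01:00.0 > /sys/bus/pci/drivers_probe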
function probe_driver() {
	local bdf=$1
	local driver_name=$2
	old_driver_name=${drivers_d["$bdf"]:-no driver}

	if [[ $driver_name == "$old_driver_name" ]]; then
		pci_dev_echo "$bdf" "Already using the $old_driver_name driver"
		return 0
	fi

	if [[ $old_driver_name != "no driver" ]]; then
		echo "$bdf" > "/sys/bus/pci/devices/$bdf/driver/unbind"
	fi

	pci_dev_echo "$bdf" "$old_driver_name -> $driver_name"

	if [[ $driver_name == "none" ]]; then
		return 0
	fi

	local probe_attempts=0
	echo "$driver_name" > "/sys/bus/pci/devices/$bdf/driver_override"
	while ! echo "$bdf" > "/sys/bus/pci/drivers_probe" && ((probe_attempts++ < 10)); do
		pci_dev_echo "$bdf" "failed to bind to $driver_name, retrying ($probe_attempts)"
		sleep 0.5
	done 2> /dev/null

	echo "" > "/sys/bus/pci/devices/$bdf/driver_override"

	if [[ ! -e /sys/bus/pci/drivers/$driver_name/$bdf ]]; then
		pci_dev_echo "$bdf" "failed to bind to $driver_name, aborting"
		return 1
	fi
}

function linux_bind_driver() {
	local bdf="$1"
	local driver_name="$2"

	probe_driver "$bdf" "$driver_name"

	iommu_group=$(basename "$(readlink -f "/sys/bus/pci/devices/$bdf/iommu_group")")
	if [ -e "/dev/vfio/$iommu_group" ]; then
		if [ -n "$TARGET_USER" ]; then
			chown "$TARGET_USER" "/dev/vfio/$iommu_group"
		fi
	fi

	local iommug=("/sys/bus/pci/devices/$bdf/iommu_group/devices/"!($bdf))
	local _bdf _driver
	if ((${#iommug[@]} > 0)) && [[ $driver_name == vfio* ]]; then
		pci_dev_echo "$bdf" "WARNING: detected multiple devices (${#iommug[@]}) under the same IOMMU group!"
		for _bdf in "${iommug[@]}"; do
			_driver=$(readlink -f "$_bdf/driver")
			if [[ ! -e $_driver || ${_driver##*/} == "$driver_name" ]]; then
				continue
			fi
			# See what DPDK considers to be a "viable" iommu group: dpdk/lib/eal/linux/eal_vfio.c -> rte_vfio_setup_device()
			pci_dev_echo "$bdf" "WARNING: ${_bdf##*/} not bound to $driver_name (${_driver##*/})"
			pci_dev_echo "$bdf" "WARNING: All devices in the IOMMU group must be bound to the same driver or unbound"
			if [[ $UNBIND_ENTIRE_IOMMU_GROUP == yes ]]; then
				pci_dev_echo "$bdf" "WARNING: Attempting to unbind ${_bdf##*/}"
				drivers_d["${_bdf##*/}"]=${_driver##*/}
				probe_driver "${_bdf##*/}" none
			fi
		done
	fi
}

function linux_unbind_driver() {
	local bdf="$1"
	local old_driver_name=${drivers_d["$bdf"]:-no driver}

	if [[ $old_driver_name == "no driver" ]]; then
		pci_dev_echo "$bdf" "Not bound to any driver"
		return 0
	fi

	if [[ -e /sys/bus/pci/drivers/$old_driver_name ]]; then
		echo "$bdf" > "/sys/bus/pci/drivers/$old_driver_name/unbind"
		echo "" > "/sys/bus/pci/devices/$bdf/driver_override"
	fi

	pci_dev_echo "$bdf" "$old_driver_name -> no driver"
}

function linux_hugetlbfs_mounts() {
	mount | grep ' type hugetlbfs ' | awk '{ print $3 }'
}

function get_block_dev_from_bdf() {
	local bdf=$1
	local block blocks=() ctrl

	for block in /sys/block/*; do
		if [[ $block == *nvme* ]]; then
			ctrl=${block##*/} ctrl=${ctrl%n*}
			if [[ -e /sys/class/nvme/$ctrl && $(< "/sys/class/nvme/$ctrl/address") == "$bdf" ]]; then
				blocks+=("${block##*/}")
			fi
		elif [[ $(readlink -f "$block/device") == *"/$bdf/"* ]]; then
			blocks+=("${block##*/}")
		fi
	done
	printf '%s\n' "${blocks[@]}"
}

function get_used_bdf_block_devs() {
	local bdf=$1
	local blocks block blockp dev mount holder
	local used

	hash lsblk &> /dev/null || return 1
	blocks=($(get_block_dev_from_bdf "$bdf"))

	for block in "${blocks[@]}"; do
		# Check if the device is held by some other device, regardless of whether
		# it's mounted or not.
		for holder in "/sys/class/block/$block"*/holders/*; do
			[[ -e $holder ]] || continue
			blockp=${holder%/holders*} blockp=${blockp##*/}
			if [[ -e $holder/slaves/$blockp ]]; then
				used+=("holder@$blockp:${holder##*/}")
			fi
		done
		while read -r dev mount; do
			if [[ -e $mount ]]; then
				used+=("mount@$block:$dev")
			fi
		done < <(lsblk -l -n -o NAME,MOUNTPOINT "/dev/$block")
		if ((${#used[@]} == 0)); then
			# Make sure we check if there's any valid data present on the target device,
			# regardless of whether it's being actively used or not. This is mainly done
			# to make sure we don't miss more complex setups like ZFS pools, etc.
			if block_in_use "$block" > /dev/null; then
				used+=("data@$block")
			fi
		fi
	done

	if ((${#used[@]} > 0)); then
		printf '%s\n' "${used[@]}"
	fi
}
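
# Hypothetical output of get_used_bdf_block_devs for a controller whose nvme0n1
# is part of a device-mapper setup and also has a mounted partition (the
# "holder@"/"mount@"/"data@" prefixes come from the function above):
#   holder@nvme0n1:dm-0
#   mount@nvme0n1:nvme0n1p1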

function collect_devices() {
	# NVMe, IOAT, DSA, IAA, VIRTIO, VMD

	local ids dev_type dev_id bdf bdfs in_use driver

	ids+="PCI_DEVICE_ID_INTEL_IOAT"
	ids+="|PCI_DEVICE_ID_INTEL_DSA"
	ids+="|PCI_DEVICE_ID_INTEL_IAA"
	ids+="|PCI_DEVICE_ID_VIRTIO"
	ids+="|PCI_DEVICE_ID_INTEL_VMD"
	ids+="|SPDK_PCI_CLASS_NVME"

	local -gA nvme_d ioat_d dsa_d iaa_d virtio_d vmd_d all_devices_d drivers_d

	while read -r _ dev_type dev_id; do
		bdfs=(${pci_bus_cache["0x8086:$dev_id"]})
		[[ $dev_type == *NVME* ]] && bdfs=(${pci_bus_cache["$dev_id"]})
		[[ $dev_type == *VIRT* ]] && bdfs=(${pci_bus_cache["0x1af4:$dev_id"]})
		[[ $dev_type =~ (NVME|IOAT|DSA|IAA|VIRTIO|VMD) ]] && dev_type=${BASH_REMATCH[1],,}
		for bdf in "${bdfs[@]}"; do
			in_use=0
			if [[ $1 != status ]]; then
				if ! pci_can_use "$bdf"; then
					pci_dev_echo "$bdf" "Skipping denied controller at $bdf"
					in_use=1
				fi
				if [[ $dev_type == nvme || $dev_type == virtio ]]; then
					if ! verify_bdf_block_devs "$bdf"; then
						in_use=1
					fi
				fi
				if [[ $dev_type == vmd ]]; then
					if [[ $PCI_ALLOWED != *"$bdf"* ]]; then
						pci_dev_echo "$bdf" "Skipping not allowed VMD controller at $bdf"
						in_use=1
					elif [[ " ${drivers_d[*]} " =~ "nvme" ]]; then
						if [[ "${DRIVER_OVERRIDE}" != "none" ]]; then
							if [ "$mode" == "config" ]; then
								cat <<- MESSAGE
									Binding new driver to VMD device. If there are NVMe SSDs behind the VMD endpoint
									which are attached to the kernel NVMe driver, the binding process may go faster
									if you first run this script with DRIVER_OVERRIDE="none" to unbind only the
									NVMe SSDs, and then run again to unbind the VMD devices.
								MESSAGE
							fi
						fi
					fi
				fi
			fi
			eval "${dev_type}_d[$bdf]=$in_use"
			all_devices_d["$bdf"]=$in_use
			if [[ -e /sys/bus/pci/devices/$bdf/driver ]]; then
				driver=$(readlink -f "/sys/bus/pci/devices/$bdf/driver")
				drivers_d["$bdf"]=${driver##*/}
			fi
		done
	done < <(grep -E "$ids" "$rootdir/include/spdk/pci_ids.h")
}
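
# collect_driver below resolves the preferred kernel driver through the device's
# modalias. Conceptually (illustrative, with a made-up BDF):
#   $ modprobe -R "$(< /sys/bus/pci/devices/0000:01:00.0/modalias)"
#   nvme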
function collect_driver() {
	local bdf=$1
	local drivers driver

	if [[ -e /sys/bus/pci/devices/$bdf/modalias ]] \
		&& drivers=($(modprobe -R "$(< "/sys/bus/pci/devices/$bdf/modalias")")); then
		# Pick the first entry in case multiple aliases are bound to a driver.
		driver=$(readlink -f "/sys/module/${drivers[0]}/drivers/pci:"*)
		driver=${driver##*/}
	else
		[[ -n ${nvme_d["$bdf"]} ]] && driver=nvme
		[[ -n ${ioat_d["$bdf"]} ]] && driver=ioatdma
		[[ -n ${dsa_d["$bdf"]} ]] && driver=dsa
		[[ -n ${iaa_d["$bdf"]} ]] && driver=iaa
		[[ -n ${virtio_d["$bdf"]} ]] && driver=virtio-pci
		[[ -n ${vmd_d["$bdf"]} ]] && driver=vmd
	fi 2> /dev/null
	echo "$driver"
}

function verify_bdf_block_devs() {
	local bdf=$1
	local blknames
	blknames=($(get_used_bdf_block_devs "$bdf")) || return 1

	if ((${#blknames[@]} > 0)); then
		local IFS=","
		pci_dev_echo "$bdf" "Active devices: ${blknames[*]}, so not binding PCI dev"
		return 1
	fi
}

function configure_linux_pci() {
	local driver_path=""
	# driver_name is deliberately not local - configure_linux() checks it later.
	driver_name=""
	igb_uio_fallback=""

	if [[ -r "$rootdir/dpdk/build-tmp/kernel/linux/igb_uio/igb_uio.ko" ]]; then
		# igb_uio is a common driver to override with and it depends on uio.
		modprobe uio || true
		if ! check_for_driver igb_uio || insmod "$rootdir/dpdk/build-tmp/kernel/linux/igb_uio/igb_uio.ko"; then
			igb_uio_fallback="$rootdir/dpdk/build-tmp/kernel/linux/igb_uio/igb_uio.ko"
		fi
	fi

	if [[ "${DRIVER_OVERRIDE}" == "none" ]]; then
		driver_name=none
	elif [[ -n "${DRIVER_OVERRIDE}" ]]; then
		driver_path="$DRIVER_OVERRIDE"
		driver_name="${DRIVER_OVERRIDE##*/}"
		# modprobe and the sysfs don't use the .ko suffix.
		driver_name=${driver_name%.ko}
		# path = name -> there is no path
		if [[ "$driver_path" = "$driver_name" ]]; then
			driver_path=""
		fi
	elif [[ -n "$(ls /sys/kernel/iommu_groups)" || (-e \
		/sys/module/vfio/parameters/enable_unsafe_noiommu_mode && \
		"$(cat /sys/module/vfio/parameters/enable_unsafe_noiommu_mode)" == "Y") ]]; then
		driver_name=vfio-pci
		# Just in case, attempt to load the VFIO_IOMMU_TYPE1 module into the kernel - this
		# should be done automatically by modprobe since this particular module should
		# be a part of vfio-pci dependencies, however, on some distros, it seems that
		# it's not the case. See #1689.
		if modinfo vfio_iommu_type1 > /dev/null; then
			modprobe vfio_iommu_type1
		fi
	elif ! check_for_driver uio_pci_generic || modinfo uio_pci_generic > /dev/null 2>&1; then
		driver_name=uio_pci_generic
	elif [[ -e $igb_uio_fallback ]]; then
		driver_path="$igb_uio_fallback"
		driver_name="igb_uio"
		echo "WARNING: uio_pci_generic not detected - using $driver_name"
	else
		echo "No valid drivers found [vfio-pci, uio_pci_generic, igb_uio]. Please enable one of the kernel modules."
		return 1
	fi

	# modprobe searches the standard module directories; if the user passed in a
	# path, use insmod instead.
	if [[ $driver_name != "none" ]]; then
		if [[ -n "$driver_path" ]]; then
			insmod "$driver_path" || true
		else
			modprobe "$driver_name"
		fi
	fi

	for bdf in "${!all_devices_d[@]}"; do
		if ((all_devices_d["$bdf"] == 0)); then
			if [[ -n ${nvme_d["$bdf"]} ]]; then
				# Some nvme controllers may take a significant amount of time while being
				# unbound from the driver. Put that task into the background to speed up the
				# whole process. Currently this is done only for the devices bound to the
				# nvme driver as others, e.g. ioatdma, trigger a kernel BUG when being
				# unbound in parallel. See https://bugzilla.kernel.org/show_bug.cgi?id=209041.
				linux_bind_driver "$bdf" "$driver_name" &
			else
				linux_bind_driver "$bdf" "$driver_name"
			fi
		fi
	done
	wait

	echo "1" > "/sys/bus/pci/rescan"
}
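
# Summary of the driver selection order implemented in configure_linux_pci() above:
#   1) DRIVER_OVERRIDE (a module name for modprobe, a .ko path for insmod, or "none")
#   2) vfio-pci, when IOMMU groups exist or unsafe no-IOMMU mode is enabled
#   3) uio_pci_generic, when the module is available
#   4) igb_uio from the bundled DPDK build, as a last resort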

function cleanup_linux() {
	local dirs_to_clean=() files_to_clean=() opened_files=() file_locks=()
	local match_spdk="spdk_tgt|iscsi|vhost|nvmf|rocksdb|bdevio|bdevperf|vhost_fuzz|nvme_fuzz|accel_perf|bdev_svc"

	dirs_to_clean=({/var/run,/tmp}/dpdk/spdk{,_pid}+([0-9]))
	if [[ -d $XDG_RUNTIME_DIR ]]; then
		dirs_to_clean+=("$XDG_RUNTIME_DIR/dpdk/spdk"{,_pid}+([0-9]))
	fi

	for dir in "${dirs_to_clean[@]}"; do
		files_to_clean+=("$dir/"*)
	done
	file_locks+=(/var/tmp/spdk_pci_lock*)
	file_locks+=(/var/tmp/spdk_cpu_lock*)

	files_to_clean+=(/dev/shm/@(@($match_spdk)_trace|spdk_iscsi_conns)*)
	files_to_clean+=("${file_locks[@]}")

	# This may fail in case the path that readlink attempts to resolve suddenly
	# disappears (as may happen with terminating processes).
	opened_files+=($(readlink -f /proc/+([0-9])/fd/+([0-9]))) || true

	if ((${#opened_files[@]} == 0)); then
		echo "Can't get list of opened files!"
		exit 1
	fi

	echo 'Cleaning'
	for f in "${files_to_clean[@]}"; do
		[[ -e $f ]] || continue
		if [[ ${opened_files[*]} != *"$f"* ]]; then
			echo "Removing: $f"
			rm "$f"
		else
			echo "Still open: $f"
		fi
	done

	for dir in "${dirs_to_clean[@]}"; do
		[[ -d $dir ]] || continue
		if [[ ${opened_files[*]} != *"$dir"* ]]; then
			echo "Removing: $dir"
			rmdir "$dir"
		else
			echo "Still open: $dir"
		fi
	done
	echo "Clean"
}

check_hugepages_alloc() {
	local hp_int=$1
	local allocated_hugepages

	allocated_hugepages=$(< "$hp_int")

	if ((NRHUGE <= allocated_hugepages)) && [[ $SHRINK_HUGE != yes ]]; then
		echo "INFO: Requested $NRHUGE hugepages but $allocated_hugepages already allocated ${2:+on node$2}"
		return 0
	fi

	echo $((NRHUGE < 0 ? 0 : NRHUGE)) > "$hp_int"

	allocated_hugepages=$(< "$hp_int")
	if ((allocated_hugepages < NRHUGE)); then
		cat <<- ERROR

			## ERROR: requested $NRHUGE hugepages but only $allocated_hugepages could be allocated ${2:+on node$2}.
			## Memory might be heavily fragmented. Please try flushing the system cache, or reboot the machine.
		ERROR
		return 1
	fi
}

clear_hugepages() { echo 0 > /proc/sys/vm/nr_hugepages; }
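
# HUGENODE parsing example for configure_linux_hugepages() below (illustrative):
# with NRHUGE=1024 and HUGENODE='nodes_hp[0]=2048,1', node0 gets 2048 pages
# (explicit nodes_hp[] assignment) and node1 gets the default 1024 (bare node id).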
configure_linux_hugepages() {
	local node nodes=()
	local nodes_to_use nodes_hp

	if [[ $CLEAR_HUGE == yes ]]; then
		clear_hugepages
	fi

	if [[ $HUGE_EVEN_ALLOC == yes ]]; then
		clear_hugepages
		check_hugepages_alloc /proc/sys/vm/nr_hugepages
		return 0
	fi

	for node in /sys/devices/system/node/node*; do
		[[ -e $node ]] || continue
		nodes[${node##*node}]=$node/hugepages/hugepages-${HUGEPGSZ}kB/nr_hugepages
	done

	if ((${#nodes[@]} == 0)); then
		# No NUMA support? Fall back to the common interface.
		check_hugepages_alloc /proc/sys/vm/nr_hugepages
		return 0
	fi

	IFS="," read -ra nodes_to_use <<< "$HUGENODE"
	if ((${#nodes_to_use[@]} == 0)); then
		nodes_to_use[0]=0
	fi

	# Align indexes with node ids
	for node in "${!nodes_to_use[@]}"; do
		if [[ ${nodes_to_use[node]} =~ ^nodes_hp\[[0-9]+\]= ]]; then
			eval "${nodes_to_use[node]}"
		elif [[ ${nodes_to_use[node]} =~ ^[0-9]+$ ]]; then
			nodes_hp[nodes_to_use[node]]=$NRHUGE
		fi
	done

	for node in "${!nodes_hp[@]}"; do
		if [[ -z ${nodes[node]} ]]; then
			echo "Node $node doesn't exist, ignoring" >&2
			continue
		fi
		NRHUGE=${nodes_hp[node]:-$NRHUGE} check_hugepages_alloc "${nodes[node]}" "$node"
	done
}

function configure_linux() {
	configure_linux_pci
	hugetlbfs_mounts=$(linux_hugetlbfs_mounts)

	if [ -z "$hugetlbfs_mounts" ]; then
		hugetlbfs_mounts=/mnt/huge
		echo "Mounting hugetlbfs at $hugetlbfs_mounts"
		mkdir -p "$hugetlbfs_mounts"
		mount -t hugetlbfs nodev "$hugetlbfs_mounts"
	fi

	configure_linux_hugepages

	if [ "$driver_name" = "vfio-pci" ]; then
		if [ -n "$TARGET_USER" ]; then
			for mount in $hugetlbfs_mounts; do
				chown "$TARGET_USER" "$mount"
				chmod g+w "$mount"
			done

			MEMLOCK_AMNT=$(su "$TARGET_USER" -c "ulimit -l")
			if [[ $MEMLOCK_AMNT != "unlimited" ]]; then
				MEMLOCK_MB=$((MEMLOCK_AMNT / 1024))
				cat <<- MEMLOCK
					"$TARGET_USER" user memlock limit: $MEMLOCK_MB MB

					This is the maximum amount of memory you will be
					able to use with DPDK and VFIO if run as user "$TARGET_USER".
					To change this, please adjust limits.conf memlock limit for user "$TARGET_USER".
				MEMLOCK
				if ((MEMLOCK_AMNT < 65536)); then
					echo ""
					echo "## WARNING: memlock limit is less than 64MB"
					echo -n "## DPDK with VFIO may not be able to initialize "
					echo "if run as user \"$TARGET_USER\"."
				fi
			fi
		fi
	fi

	if [ "$(uname -i)" == "x86_64" ] && [ ! -e /dev/cpu/0/msr ]; then
		# Some distros build msr as a module. Make sure it's loaded to ensure
		# DPDK can easily figure out the TSC rate rather than relying on 100ms
		# sleeps.
		modprobe msr &> /dev/null || true
	fi
}
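
# To raise the memlock limit mentioned above persistently, entries along these
# lines in /etc/security/limits.conf are commonly used (illustrative; "spdkuser"
# stands in for the actual TARGET_USER):
#   spdkuser    soft    memlock    unlimited
#   spdkuser    hard    memlock    unlimited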

function reset_linux_pci() {
	# virtio
	# TODO: check if virtio-pci is loaded first and just unbind if it is not loaded
	# Requires some more investigation - for example, some kernels do not seem to have
	# virtio-pci but just virtio_scsi instead. Also need to make sure we get the
	# underscore vs. dash right in the virtio_scsi name.
	modprobe virtio-pci || true
	for bdf in "${!all_devices_d[@]}"; do
		((all_devices_d["$bdf"] == 0)) || continue

		driver=$(collect_driver "$bdf")
		if [[ -n $driver ]] && ! check_for_driver "$driver"; then
			linux_bind_driver "$bdf" "$driver"
		else
			linux_unbind_driver "$bdf"
		fi
	done

	echo "1" > "/sys/bus/pci/rescan"
}

function reset_linux() {
	reset_linux_pci
	for mount in $(linux_hugetlbfs_mounts); do
		for hp in "$mount"/spdk*map_*; do
			flock -n "$hp" true && rm -f "$hp"
		done
	done
	rm -f /run/.spdk*
}

function status_linux() {
	echo "Hugepages" >&2
	printf "%-6s %10s %8s / %6s\n" "node" "hugesize" "free" "total" >&2

	numa_nodes=0
	for path in /sys/devices/system/node/node*/hugepages/hugepages-*/; do
		numa_nodes=$((numa_nodes + 1))
		free_pages=$(< "$path/free_hugepages")
		all_pages=$(< "$path/nr_hugepages")

		[[ $path =~ (node[0-9]+)/hugepages/hugepages-([0-9]+kB) ]]

		node=${BASH_REMATCH[1]}
		huge_size=${BASH_REMATCH[2]}

		printf "%-6s %10s %8s / %6s\n" "$node" "$huge_size" "$free_pages" "$all_pages"
	done

	# fall back to system-wide hugepages
	if [ "$numa_nodes" = "0" ]; then
		free_pages=$(grep HugePages_Free /proc/meminfo | awk '{ print $2 }')
		all_pages=$(grep HugePages_Total /proc/meminfo | awk '{ print $2 }')
		node="-"
		huge_size="$HUGEPGSZ"

		printf "%-6s %10s %8s / %6s\n" "$node" "$huge_size" "$free_pages" "$all_pages"
	fi

	printf '\n%-8s %-15s %-6s %-6s %-7s %-16s %-10s %s\n' \
		"Type" "BDF" "Vendor" "Device" "NUMA" "Driver" "Name" "Block devices" >&2

	sorted_bdfs=($(printf '%s\n' "${!all_devices_d[@]}" | sort))

	for bdf in "${sorted_bdfs[@]}"; do
		driver=${drivers_d["$bdf"]}
		if [ "$numa_nodes" = "0" ]; then
			node="-"
		else
			node=$(< "/sys/bus/pci/devices/$bdf/numa_node")
			if ((node == -1)); then
				node=unknown
			fi
		fi
		if [ "$driver" = "nvme" ] && [ -d /sys/bus/pci/devices/$bdf/nvme ]; then
			name=$(ls "/sys/bus/pci/devices/$bdf/nvme")
		else
			name="-"
		fi

		if [[ -n ${nvme_d["$bdf"]} || -n ${virtio_d["$bdf"]} ]]; then
			blknames=($(get_block_dev_from_bdf "$bdf"))
		else
			blknames=("-")
		fi

		desc=""
		desc=${desc:-${nvme_d["$bdf"]:+NVMe}}
		desc=${desc:-${ioat_d["$bdf"]:+I/OAT}}
		desc=${desc:-${dsa_d["$bdf"]:+DSA}}
		desc=${desc:-${iaa_d["$bdf"]:+IAA}}
		desc=${desc:-${virtio_d["$bdf"]:+virtio}}
		desc=${desc:-${vmd_d["$bdf"]:+VMD}}

		printf '%-8s %-15s %-6s %-6s %-7s %-16s %-10s %s\n' \
			"$desc" "$bdf" "${pci_ids_vendor["$bdf"]#0x}" "${pci_ids_device["$bdf"]#0x}" \
			"$node" "${driver:--}" "${name:-}" "${blknames[*]:--}"
	done
}
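
# A status_linux device row might look like this (hypothetical values, laid out
# by the printf format above):
#   Type     BDF             Vendor Device NUMA    Driver           Name       Block devices
#   NVMe     0000:01:00.0    8086   0953   0       vfio-pci         -          -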

function status_freebsd() {
	local pci

	status_print() (
		local type=$1
		local dev driver

		shift

		for pci; do
			printf '%-8s %-15s %-6s %-6s %-16s\n' \
				"$type" \
				"$pci" \
				"${pci_ids_vendor["$pci"]}" \
				"${pci_ids_device["$pci"]}" \
				"${pci_bus_driver["$pci"]}"
		done | sort -k2,2
	)

	local contigmem=present
	local contigmem_buffer_size
	local contigmem_num_buffers

	if ! kldstat -q -m contigmem; then
		contigmem="not present"
	fi
	if ! contigmem_buffer_size=$(kenv hw.contigmem.buffer_size 2> /dev/null); then
		contigmem_buffer_size="not set"
	fi
	if ! contigmem_num_buffers=$(kenv hw.contigmem.num_buffers 2> /dev/null); then
		contigmem_num_buffers="not set"
	fi

	cat <<- BSD_INFO
		Contigmem ($contigmem)
		Buffer Size: $contigmem_buffer_size
		Num Buffers: $contigmem_num_buffers

	BSD_INFO

	printf '\n%-8s %-15s %-6s %-6s %-16s\n' \
		"Type" "BDF" "Vendor" "Device" "Driver" >&2

	status_print "NVMe" "${!nvme_d[@]}"
	status_print "I/OAT" "${!ioat_d[@]}"
	status_print "DSA" "${!dsa_d[@]}"
	status_print "IAA" "${!iaa_d[@]}"
	status_print "VMD" "${!vmd_d[@]}"
}

function configure_freebsd_pci() {
	local BDFS

	BDFS+=("${!nvme_d[@]}")
	BDFS+=("${!ioat_d[@]}")
	BDFS+=("${!dsa_d[@]}")
	BDFS+=("${!iaa_d[@]}")
	BDFS+=("${!vmd_d[@]}")

	# Drop the domain part from all the addresses
	BDFS=("${BDFS[@]#*:}")

	local IFS=","
	kldunload nic_uio.ko || true
	kenv hw.nic_uio.bdfs="${BDFS[*]}"
	kldload nic_uio.ko
}

function configure_freebsd() {
	if ! check_for_driver_freebsd; then
		echo "DPDK drivers (contigmem and/or nic_uio) are missing, aborting" >&2
		return 1
	fi
	configure_freebsd_pci
	# If contigmem is already loaded but the HUGEMEM specified doesn't match the
	# previous value, unload contigmem so that we can reload with the new value.
	if kldstat -q -m contigmem; then
		# contigmem may be loaded, but the kernel environment doesn't have to
		# be necessarily set at this point. If it isn't, kenv will fail to
		# pick up the hw. options. Handle it.
		if ! contigmem_num_buffers=$(kenv hw.contigmem.num_buffers); then
			contigmem_num_buffers=-1
		fi 2> /dev/null
		if ((contigmem_num_buffers != HUGEMEM / 256)); then
			kldunload contigmem.ko
		fi
	fi
	if ! kldstat -q -m contigmem; then
		kenv hw.contigmem.num_buffers=$((HUGEMEM / 256))
		kenv hw.contigmem.buffer_size=$((256 * 1024 * 1024))
		kldload contigmem.ko
	fi
}

function reset_freebsd() {
	kldunload contigmem.ko || true
	kldunload nic_uio.ko || true
}

CMD=reset cache_pci_bus

mode=$1

if [ -z "$mode" ]; then
	mode="config"
fi

: ${HUGEMEM:=2048}
: ${PCI_ALLOWED:=""}
: ${PCI_BLOCKED:=""}

if [ -n "$NVME_ALLOWED" ]; then
	PCI_ALLOWED="$PCI_ALLOWED $NVME_ALLOWED"
fi

if [ -n "$SKIP_PCI" ]; then
	PCI_ALLOWED="none"
fi

if [ -z "$TARGET_USER" ]; then
	TARGET_USER="$SUDO_USER"
	if [ -z "$TARGET_USER" ]; then
		TARGET_USER=$(logname 2> /dev/null) || true
	fi
fi

collect_devices "$mode"

if [[ $mode == reset && $PCI_BLOCK_SYNC_ON_RESET == yes ]]; then
	# Note that this will wait only for the first block device attached to
	# a given storage controller. For nvme this may miss some of the devs
	# in case multiple namespaces are in place.
	# FIXME: Wait for nvme controller(s) to be in live state and determine
	# the number of configured namespaces, build a list of potential block devs
	# and pass them to sync_dev_uevents. Is it worth the effort?
	bdfs_to_wait_for=()
	for bdf in "${!all_devices_d[@]}"; do
		((all_devices_d["$bdf"] == 0)) || continue
		if [[ -n ${nvme_d["$bdf"]} || -n ${virtio_d["$bdf"]} ]]; then
			[[ $(collect_driver "$bdf") != "${drivers_d["$bdf"]}" ]] || continue
			bdfs_to_wait_for+=("$bdf")
		fi
	done
	if ((${#bdfs_to_wait_for[@]} > 0)); then
		echo "Waiting for block devices as requested"
		export UEVENT_TIMEOUT=5 DEVPATH_LOOKUP=yes DEVPATH_SUBSYSTEM=pci
		"$rootdir/scripts/sync_dev_uevents.sh" \
			block/disk \
			"${bdfs_to_wait_for[@]}" &
		sync_pid=$!
	fi
fi

if [[ $os == Linux ]]; then
	if [[ -n $HUGEPGSZ && ! -e /sys/kernel/mm/hugepages/hugepages-${HUGEPGSZ}kB ]]; then
		echo "${HUGEPGSZ}kB is not supported by the running kernel, ignoring" >&2
		unset -v HUGEPGSZ
	fi

	HUGEPGSZ=${HUGEPGSZ:-$(grep Hugepagesize /proc/meminfo | cut -d : -f 2 | tr -dc '0-9')}
	HUGEPGSZ_MB=$((HUGEPGSZ / 1024))
	: ${NRHUGE=$(((HUGEMEM + HUGEPGSZ_MB - 1) / HUGEPGSZ_MB))}

	if [ "$mode" == "config" ]; then
		configure_linux
	elif [ "$mode" == "cleanup" ]; then
		cleanup_linux
		clear_hugepages
	elif [ "$mode" == "reset" ]; then
		reset_linux
	elif [ "$mode" == "status" ]; then
		status_linux
	elif [ "$mode" == "help" ]; then
		usage "$0"
	else
		usage "$0" "Invalid argument '$mode'"
	fi
else
	if [ "$mode" == "config" ]; then
		configure_freebsd
	elif [ "$mode" == "reset" ]; then
		reset_freebsd
	elif [ "$mode" == "cleanup" ]; then
		echo "setup.sh cleanup function not yet supported on $os"
	elif [ "$mode" == "status" ]; then
		status_freebsd
	elif [ "$mode" == "help" ]; then
		usage "$0"
	else
		usage "$0" "Invalid argument '$mode'"
	fi
fi

if [[ -e /proc/$sync_pid/status ]]; then
	wait "$sync_pid"
fi