1#!/usr/bin/env bash 2# SPDX-License-Identifier: BSD-3-Clause 3# Copyright (C) 2016 Intel Corporation 4# All rights reserved. 5# 6set -e 7shopt -s nullglob extglob 8 9os=$(uname -s) 10 11if [[ $os != Linux && $os != FreeBSD ]]; then 12 echo "Not supported platform ($os), aborting" 13 exit 1 14fi 15 16rootdir=$(readlink -f $(dirname $0))/.. 17source "$rootdir/scripts/common.sh" 18 19function usage() { 20 if [[ $os == Linux ]]; then 21 options="[config|reset|status|cleanup|interactive|help]" 22 else 23 options="[config|reset|interactive|help]" 24 fi 25 26 [[ -n $2 ]] && ( 27 echo "$2" 28 echo "" 29 ) 30 echo "Helper script for allocating hugepages and binding NVMe, I/OAT, VMD and Virtio devices" 31 echo "to a generic VFIO kernel driver. If VFIO is not available on the system, this script" 32 echo "will fall back to UIO. NVMe and Virtio devices with active mountpoints will be ignored." 33 echo "All hugepage operations use default hugepage size on the system (hugepagesz)." 34 echo "Usage: $(basename $1) $options" 35 echo 36 echo "$options - as following:" 37 echo "config Default mode. Allocate hugepages and bind PCI devices." 38 if [[ $os == Linux ]]; then 39 echo "cleanup Remove any orphaned files that can be left in the system after SPDK application exit" 40 fi 41 echo "reset Rebind PCI devices back to their original drivers." 42 echo " Also cleanup any leftover spdk files/resources." 43 echo " Hugepage memory size will remain unchanged." 44 if [[ $os == Linux ]]; then 45 echo "status Print status of all SPDK-compatible devices on the system." 46 fi 47 echo "interactive Executes script in interactive mode." 48 echo "help Print this help message." 49 echo 50 echo "The following environment variables can be specified." 51 echo "HUGEMEM Size of hugepage memory to allocate (in MB). 2048 by default." 52 echo " For NUMA systems, the hugepages will be distributed on node0 by" 53 echo " default." 54 echo "HUGE_EVEN_ALLOC If set to 'yes', hugepages will be evenly distributed across all" 55 echo " system's NUMA nodes (effectively ignoring anything set in HUGENODE)." 56 echo " Uses kernel's default for hugepages size." 57 echo "NRHUGE Number of hugepages to allocate. This variable overwrites HUGEMEM." 58 echo "HUGENODE Specific NUMA node to allocate hugepages on. Multiple nodes can be" 59 echo " separated with comas. By default, NRHUGE will be applied on each node." 60 echo " Hugepages can be defined per node with e.g.:" 61 echo " HUGENODE='nodes_hp[0]=2048,nodes_hp[1]=512,2' - this will allocate" 62 echo " 2048 pages for node0, 512 for node1 and default NRHUGE for node2." 63 echo "HUGEPGSZ Size of the hugepages to use in kB. If not set, kernel's default" 64 echo " setting is used." 65 echo "SHRINK_HUGE If set to 'yes', hugepages allocation won't be skipped in case" 66 echo " number of requested hugepages is lower from what's already" 67 echo " allocated. Doesn't apply when HUGE_EVEN_ALLOC is in use." 68 echo "CLEAR_HUGE If set to 'yes', the attempt to remove hugepages from all nodes will" 69 echo " be made prior to allocation". 70 echo "PCI_ALLOWED" 71 echo "PCI_BLOCKED Whitespace separated list of PCI devices (NVMe, I/OAT, VMD, Virtio)." 72 echo " Each device must be specified as a full PCI address." 73 echo " E.g. PCI_ALLOWED=\"0000:01:00.0 0000:02:00.0\"" 74 echo " To block all PCI devices: PCI_ALLOWED=\"none\"" 75 echo " To allow all PCI devices except 0000:01:00.0: PCI_BLOCKED=\"0000:01:00.0\"" 76 echo " To allow only PCI device 0000:01:00.0: PCI_ALLOWED=\"0000:01:00.0\"" 77 echo " If PCI_ALLOWED and PCI_BLOCKED are empty or unset, all PCI devices" 78 echo " will be bound." 79 echo " Each device in PCI_BLOCKED will be ignored (driver won't be changed)." 80 echo " PCI_BLOCKED has precedence over PCI_ALLOWED." 81 echo "TARGET_USER User that will own hugepage mountpoint directory and vfio groups." 82 echo " By default the current user will be used." 83 echo "DRIVER_OVERRIDE Disable automatic vfio-pci/uio_pci_generic selection and forcefully" 84 echo " bind devices to the given driver." 85 echo " E.g. DRIVER_OVERRIDE=uio_pci_generic or DRIVER_OVERRIDE=/home/public/dpdk/build/kmod/igb_uio.ko" 86 echo "PCI_BLOCK_SYNC_ON_RESET" 87 echo " If set in the environment, the attempt to wait for block devices associated" 88 echo " with given PCI device will be made upon reset" 89 echo "UNBIND_ENTIRE_IOMMU_GROUP" 90 echo " If set, all devices from nvme's iommu group will be unbound from their drivers." 91 echo " Use with caution." 92 exit 0 93} 94 95# In monolithic kernels the lsmod won't work. So 96# back that with a /sys/modules. We also check 97# /sys/bus/pci/drivers/ as neither lsmod nor /sys/modules might 98# contain needed info (like in Fedora-like OS). 99function check_for_driver() { 100 if [[ -z $1 ]]; then 101 return 0 102 fi 103 104 if lsmod | grep -q ${1//-/_}; then 105 return 1 106 fi 107 108 if [[ -d /sys/module/${1} || -d \ 109 /sys/module/${1//-/_} || -d \ 110 /sys/bus/pci/drivers/${1} || -d \ 111 /sys/bus/pci/drivers/${1//-/_} ]]; then 112 return 2 113 fi 114 return 0 115} 116 117function check_for_driver_freebsd() { 118 # Check if dpdk drivers (nic_uio, contigmem) are in the kernel's module path. 119 local search_paths path driver 120 IFS=";" read -ra search_paths < <(kldconfig -rU) 121 122 for driver in contigmem.ko nic_uio.ko; do 123 for path in "${search_paths[@]}"; do 124 [[ -f $path/$driver ]] && continue 2 125 done 126 return 1 127 done 128 return 0 129} 130 131function pci_dev_echo() { 132 local bdf="$1" 133 shift 134 echo "$bdf (${pci_ids_vendor["$bdf"]#0x} ${pci_ids_device["$bdf"]#0x}): $*" 135} 136 137function probe_driver() { 138 local bdf=$1 139 local driver_name=$2 140 old_driver_name=${drivers_d["$bdf"]:-no driver} 141 142 if [[ $driver_name == "$old_driver_name" ]]; then 143 pci_dev_echo "$bdf" "Already using the $old_driver_name driver" 144 return 0 145 fi 146 147 if [[ $old_driver_name != "no driver" ]]; then 148 echo "$bdf" > "/sys/bus/pci/devices/$bdf/driver/unbind" 149 fi 150 151 pci_dev_echo "$bdf" "$old_driver_name -> $driver_name" 152 153 if [[ $driver_name == "none" ]]; then 154 return 0 155 fi 156 157 local probe_attempts=0 158 echo "$driver_name" > "/sys/bus/pci/devices/$bdf/driver_override" 159 while ! echo "$bdf" > "/sys/bus/pci/drivers_probe" && ((probe_attempts++ < 10)); do 160 pci_dev_echo "$bdf" "failed to bind to $driver_name, retrying ($probe_attempts)" 161 sleep 0.5 162 done 2> /dev/null 163 164 echo "" > "/sys/bus/pci/devices/$bdf/driver_override" 165 166 if [[ ! -e /sys/bus/pci/drivers/$driver_name/$bdf ]]; then 167 pci_dev_echo "$bdf" "failed to bind to $driver_name, aborting" 168 return 1 169 fi 170} 171 172function linux_bind_driver() { 173 local bdf="$1" 174 local driver_name="$2" 175 176 probe_driver "$bdf" "$driver_name" 177 178 local iommu_group=${pci_iommu_groups["$bdf"]} 179 if [ -e "/dev/vfio/$iommu_group" ]; then 180 if [ -n "$TARGET_USER" ]; then 181 chown "$TARGET_USER" "/dev/vfio/$iommu_group" 182 fi 183 fi 184 185 local iommug=("${!iommu_groups[iommu_group]}") 186 local _bdf _driver 187 if ((${#iommug[@]} > 1)) && [[ $driver_name == vfio* ]]; then 188 pci_dev_echo "$bdf" "WARNING: detected multiple devices (${#iommug[@]}) under the same IOMMU group!" 189 for _bdf in "${iommug[@]}"; do 190 [[ $_bdf == "$bdf" ]] && continue 191 _driver=$(readlink -f "/sys/bus/pci/devices/$_bdf/driver") && _driver=${_driver##*/} 192 if [[ $_driver == "$driver_name" ]]; then 193 continue 194 fi 195 # See what DPDK considers to be a "viable" iommu group: dpdk/lib/eal/linux/eal_vfio.c -> rte_vfio_setup_device() 196 pci_dev_echo "$bdf" "WARNING: ${_bdf##*/} not bound to $driver_name (${_driver##*/})" 197 pci_dev_echo "$bdf" "WARNING All devices in the IOMMU group must be bound to the same driver or unbound" 198 if [[ $UNBIND_ENTIRE_IOMMU_GROUP == yes ]]; then 199 pci_dev_echo "$bdf" "WARNING: Attempting to unbind ${_bdf##*/}" 200 drivers_d["${_bdf##*/}"]=${_driver##*/} 201 probe_driver "${_bdf##*/}" none 202 fi 203 done 204 fi 205 206} 207 208function linux_unbind_driver() { 209 local bdf="$1" 210 local old_driver_name=${drivers_d["$bdf"]:-no driver} 211 212 if [[ $old_driver_name == "no driver" ]]; then 213 pci_dev_echo "$bdf" "Not bound to any driver" 214 return 0 215 fi 216 217 if [[ -e /sys/bus/pci/drivers/$old_driver_name ]]; then 218 echo "$bdf" > "/sys/bus/pci/drivers/$old_driver_name/unbind" 219 echo "" > "/sys/bus/pci/devices/$bdf/driver_override" 220 fi 221 222 pci_dev_echo "$bdf" "$old_driver_name -> no driver" 223} 224 225function linux_hugetlbfs_mounts() { 226 mount | grep ' type hugetlbfs ' | awk '{ print $3 }' 227} 228 229function get_block_dev_from_bdf() { 230 local bdf=$1 231 local block blocks=() ctrl 232 233 for block in /sys/block/*; do 234 if [[ $block == *nvme* ]]; then 235 ctrl=$(readlink -f "$block/device") ctrl=${ctrl##*/} 236 if [[ -e /sys/class/nvme/$ctrl && $(< "/sys/class/nvme/$ctrl/address") == "$bdf" ]]; then 237 blocks+=("${block##*/}") 238 fi 239 elif [[ $(readlink -f "$block/device") == *"/$bdf/"* ]]; then 240 blocks+=("${block##*/}") 241 fi 242 done 243 printf '%s\n' "${blocks[@]}" 244} 245 246function get_used_bdf_block_devs() { 247 local bdf=$1 248 local blocks block blockp dev mount holder 249 local used 250 251 hash lsblk &> /dev/null || return 1 252 blocks=($(get_block_dev_from_bdf "$bdf")) 253 254 for block in "${blocks[@]}"; do 255 # Check if the device is hold by some other, regardless if it's mounted 256 # or not. 257 for holder in "/sys/class/block/$block"*/holders/*; do 258 [[ -e $holder ]] || continue 259 blockp=${holder%/holders*} blockp=${blockp##*/} 260 if [[ -e $holder/slaves/$blockp ]]; then 261 used+=("holder@$blockp:${holder##*/}") 262 fi 263 done 264 while read -r dev mount; do 265 if [[ -e $mount ]]; then 266 used+=("mount@$block:$dev") 267 fi 268 done < <(lsblk -l -n -o NAME,MOUNTPOINT "/dev/$block") 269 if ((${#used[@]} == 0)); then 270 # Make sure we check if there's any valid data present on the target device 271 # regardless if it's being actively used or not. This is mainly done to make 272 # sure we don't miss more complex setups like ZFS pools, etc. 273 if block_in_use "$block" > /dev/null; then 274 used+=("data@$block") 275 fi 276 fi 277 done 278 279 if ((${#used[@]} > 0)); then 280 printf '%s\n' "${used[@]}" 281 fi 282} 283 284is_nvme_behind_vmd() { 285 local nvme_bdf=$1 dev_path 286 287 IFS="/" read -ra dev_path < <(readlink -f "/sys/bus/pci/devices/$nvme_bdf") 288 289 for dev in "${dev_path[@]}"; do 290 [[ -n $dev && -n ${vmd_d["$dev"]} ]] && echo $dev && return 0 291 done 292 return 1 293} 294 295is_nvme_iommu_shared_with_vmd() { 296 local nvme_bdf=$1 vmd 297 298 # This use-case is quite specific to vfio-pci|iommu setup 299 is_iommu_enabled || return 1 300 301 [[ -n ${nvme_vmd_d["$nvme_bdf"]} ]] || return 1 302 # nvme is behind VMD ... 303 ((pci_iommu_groups["$nvme_bdf"] == pci_iommu_groups["${nvme_vmd_d["$nvme_bdf"]}"])) || return 1 304 # ... and it shares iommu_group with it 305} 306 307function collect_devices() { 308 # NVMe, IOAT, DSA, IAA, VIRTIO, VMD 309 310 local ids dev_type dev_id bdf bdfs in_use driver _vmd 311 312 ids+="PCI_DEVICE_ID_INTEL_IOAT" 313 ids+="|PCI_DEVICE_ID_INTEL_DSA" 314 ids+="|PCI_DEVICE_ID_INTEL_IAA" 315 ids+="|PCI_DEVICE_ID_VIRTIO" 316 ids+="|PCI_DEVICE_ID_INTEL_VMD" 317 ids+="|SPDK_PCI_CLASS_NVME" 318 319 local -gA nvme_d ioat_d dsa_d iaa_d virtio_d vmd_d all_devices_d drivers_d types_d all_devices_type_d nvme_vmd_d 320 321 while read -r _ dev_type dev_id; do 322 bdfs=(${pci_bus_cache["0x8086:$dev_id"]}) 323 [[ $dev_type == *NVME* ]] && bdfs=(${pci_bus_cache["$dev_id"]}) 324 [[ $dev_type == *VIRT* ]] && bdfs=(${pci_bus_cache["0x1af4:$dev_id"]}) 325 [[ $dev_type =~ (NVME|IOAT|DSA|IAA|VIRTIO|VMD) ]] && dev_type=${BASH_REMATCH[1],,} 326 types_d["$dev_type"]=1 327 for bdf in "${bdfs[@]}"; do 328 in_use=0 329 if [[ $1 != status ]]; then 330 if ! pci_can_use "$bdf"; then 331 pci_dev_echo "$bdf" "Skipping denied controller at $bdf" 332 in_use=1 333 fi 334 if [[ $dev_type == nvme || $dev_type == virtio ]]; then 335 if ! verify_bdf_block_devs "$bdf"; then 336 in_use=1 337 fi 338 fi 339 if [[ $dev_type == vmd ]]; then 340 if [[ $PCI_ALLOWED != *"$bdf"* ]]; then 341 pci_dev_echo "$bdf" "Skipping not allowed VMD controller at $bdf" 342 in_use=1 343 elif [[ " ${drivers_d[*]} " =~ "nvme" ]]; then 344 if [[ "${DRIVER_OVERRIDE}" != "none" ]]; then 345 if [ "$mode" == "config" ]; then 346 cat <<- MESSAGE 347 Binding new driver to VMD device. If there are NVMe SSDs behind the VMD endpoint 348 which are attached to the kernel NVMe driver,the binding process may go faster 349 if you first run this script with DRIVER_OVERRIDE="none" to unbind only the 350 NVMe SSDs, and then run again to unbind the VMD devices." 351 MESSAGE 352 fi 353 fi 354 fi 355 fi 356 fi 357 eval "${dev_type}_d[$bdf]=$in_use" 358 all_devices_d["$bdf"]=$in_use 359 all_devices_type_d["$bdf"]=$dev_type 360 if [[ -e /sys/bus/pci/devices/$bdf/driver ]]; then 361 driver=$(readlink -f "/sys/bus/pci/devices/$bdf/driver") 362 drivers_d["$bdf"]=${driver##*/} 363 else 364 drivers_d["$bdf"]="" 365 fi 366 done 367 done < <(grep -E "$ids" "$rootdir/include/spdk/pci_ids.h") 368 369 for bdf in "${!nvme_d[@]}"; do 370 _vmd=$(is_nvme_behind_vmd "$bdf") && nvme_vmd_d["$bdf"]=$_vmd 371 done 372 373 # Check if we got any nvmes attached to VMDs sharing the same iommu_group - if there are 374 # any skip them since they won't be usable by SPDK without moving the entire VMD ctrl 375 # away from the kernel first. That said, allow to touch the nvmes in case user requested 376 # all devices to be unbound from any driver or if dedicated override flag was set. 377 [[ -z $ALLOW_NVME_BEHIND_VMD && $DRIVER_OVERRIDE != none ]] || return 0 378 379 for bdf in "${!nvme_d[@]}"; do 380 is_nvme_iommu_shared_with_vmd "$bdf" || continue 381 nvme_d["$bdf"]=1 all_devices_d["$bdf"]=1 382 pci_dev_echo "$bdf" "Skipping nvme behind VMD (${nvme_vmd_d["$bdf"]})" 383 done 384 385 return 0 386} 387 388function collect_driver() { 389 local bdf=$1 390 local drivers driver 391 392 if [[ -e /sys/bus/pci/devices/$bdf/modalias ]] \ 393 && drivers=($(modprobe -R "$(< "/sys/bus/pci/devices/$bdf/modalias")")); then 394 # Pick first entry in case multiple aliases are bound to a driver. 395 driver=$(readlink -f "/sys/module/${drivers[0]}/drivers/pci:"*) 396 driver=${driver##*/} 397 else 398 [[ -n ${nvme_d["$bdf"]} ]] && driver=nvme 399 [[ -n ${ioat_d["$bdf"]} ]] && driver=ioatdma 400 [[ -n ${dsa_d["$bdf"]} ]] && driver=idxd 401 [[ -n ${iaa_d["$bdf"]} ]] && driver=idxd 402 [[ -n ${virtio_d["$bdf"]} ]] && driver=virtio-pci 403 [[ -n ${vmd_d["$bdf"]} ]] && driver=vmd 404 fi 2> /dev/null 405 echo "$driver" 406} 407 408function verify_bdf_block_devs() { 409 local bdf=$1 410 local blknames 411 blknames=($(get_used_bdf_block_devs "$bdf")) || return 1 412 413 if ((${#blknames[@]} > 0)); then 414 local IFS="," 415 pci_dev_echo "$bdf" "Active devices: ${blknames[*]}, so not binding PCI dev" 416 return 1 417 fi 418} 419 420function configure_linux_pci() { 421 local driver_path="" 422 driver_name="" 423 igb_uio_fallback="" 424 425 if [[ -r "$rootdir/dpdk/build-tmp/kernel/linux/igb_uio/igb_uio.ko" ]]; then 426 # igb_uio is a common driver to override with and it depends on uio. 427 modprobe uio || true 428 if ! check_for_driver igb_uio || insmod "$rootdir/dpdk/build-tmp/kernel/linux/igb_uio/igb_uio.ko"; then 429 igb_uio_fallback="$rootdir/dpdk/build-tmp/kernel/linux/igb_uio/igb_uio.ko" 430 fi 431 fi 432 433 if [[ "${DRIVER_OVERRIDE}" == "none" ]]; then 434 driver_name=none 435 elif [[ -n "${DRIVER_OVERRIDE}" ]]; then 436 driver_path="$DRIVER_OVERRIDE" 437 driver_name="${DRIVER_OVERRIDE##*/}" 438 # modprobe and the sysfs don't use the .ko suffix. 439 driver_name=${driver_name%.ko} 440 # path = name -> there is no path 441 if [[ "$driver_path" = "$driver_name" ]]; then 442 driver_path="" 443 fi 444 elif is_iommu_enabled; then 445 driver_name=vfio-pci 446 # Just in case, attempt to load VFIO_IOMMU_TYPE1 module into the kernel - this 447 # should be done automatically by modprobe since this particular module should 448 # be a part of vfio-pci dependencies, however, on some distros, it seems that 449 # it's not the case. See #1689. 450 if modinfo vfio_iommu_type1 > /dev/null; then 451 modprobe vfio_iommu_type1 452 fi 453 elif ! check_for_driver uio_pci_generic || modinfo uio_pci_generic > /dev/null 2>&1; then 454 driver_name=uio_pci_generic 455 elif [[ -e $igb_uio_fallback ]]; then 456 driver_path="$igb_uio_fallback" 457 driver_name="igb_uio" 458 echo "WARNING: uio_pci_generic not detected - using $driver_name" 459 else 460 echo "No valid drivers found [vfio-pci, uio_pci_generic, igb_uio]. Please enable one of the kernel modules." 461 return 1 462 fi 463 464 # modprobe assumes the directory of the module. If the user passes in a path, we should use insmod 465 if [[ $driver_name != "none" ]]; then 466 if [[ -n "$driver_path" ]]; then 467 insmod $driver_path || true 468 else 469 modprobe $driver_name 470 fi 471 fi 472 473 for bdf in "${!all_devices_d[@]}"; do 474 if ((all_devices_d["$bdf"] == 0)); then 475 if [[ -n ${nvme_d["$bdf"]} ]]; then 476 # Some nvme controllers may take significant amount of time while being 477 # unbound from the driver. Put that task into background to speed up the 478 # whole process. Currently this is done only for the devices bound to the 479 # nvme driver as other, i.e., ioatdma's, trigger a kernel BUG when being 480 # unbound in parallel. See https://bugzilla.kernel.org/show_bug.cgi?id=209041. 481 linux_bind_driver "$bdf" "$driver_name" & 482 else 483 linux_bind_driver "$bdf" "$driver_name" 484 fi 485 fi 486 done 487 wait 488 489 echo "1" > "/sys/bus/pci/rescan" 490} 491 492function cleanup_linux() { 493 local dirs_to_clean=() files_to_clean=() opened_files=() file_locks=() 494 local match_spdk="spdk_tgt|iscsi|vhost|nvmf|rocksdb|bdevio|bdevperf|vhost_fuzz|nvme_fuzz|accel_perf|bdev_svc" 495 496 dirs_to_clean=({/var/run,/tmp}/dpdk/spdk{,_pid}+([0-9])) 497 if [[ -d $XDG_RUNTIME_DIR ]]; then 498 dirs_to_clean+=("$XDG_RUNTIME_DIR/dpdk/spdk"{,_pid}+([0-9])) 499 fi 500 501 for dir in "${dirs_to_clean[@]}"; do 502 files_to_clean+=("$dir/"*) 503 done 504 file_locks+=(/var/tmp/spdk_pci_lock*) 505 file_locks+=(/var/tmp/spdk_cpu_lock*) 506 507 files_to_clean+=(/dev/shm/@(@($match_spdk)_trace|spdk_iscsi_conns)*) 508 files_to_clean+=("${file_locks[@]}") 509 510 # This may fail in case path that readlink attempts to resolve suddenly 511 # disappears (as it may happen with terminating processes). 512 opened_files+=($(readlink -f /proc/+([0-9])/fd/+([0-9]))) || true 513 514 if ((${#opened_files[@]} == 0)); then 515 echo "Can't get list of opened files!" 516 exit 1 517 fi 518 519 echo 'Cleaning' 520 for f in "${files_to_clean[@]}"; do 521 [[ -e $f ]] || continue 522 if [[ ${opened_files[*]} != *"$f"* ]]; then 523 echo "Removing: $f" 524 rm $f 525 else 526 echo "Still open: $f" 527 fi 528 done 529 530 for dir in "${dirs_to_clean[@]}"; do 531 [[ -d $dir ]] || continue 532 if [[ ${opened_files[*]} != *"$dir"* ]]; then 533 echo "Removing: $dir" 534 rmdir $dir 535 else 536 echo "Still open: $dir" 537 fi 538 done 539 echo "Clean" 540} 541 542check_hugepages_alloc() { 543 local hp_int=$1 544 local allocated_hugepages 545 546 allocated_hugepages=$(< "$hp_int") 547 548 if ((NRHUGE <= allocated_hugepages)) && [[ $SHRINK_HUGE != yes ]]; then 549 echo "INFO: Requested $NRHUGE hugepages but $allocated_hugepages already allocated ${2:+on node$2}" 550 return 0 551 fi 552 553 echo $((NRHUGE < 0 ? 0 : NRHUGE)) > "$hp_int" 554 555 allocated_hugepages=$(< "$hp_int") 556 if ((allocated_hugepages < NRHUGE)); then 557 cat <<- ERROR 558 559 ## ERROR: requested $NRHUGE hugepages but $allocated_hugepages could be allocated ${2:+on node$2}. 560 ## Memory might be heavily fragmented. Please try flushing the system cache, or reboot the machine. 561 ERROR 562 return 1 563 fi 564} 565 566clear_hugepages() { echo 0 > /proc/sys/vm/nr_hugepages; } 567 568configure_linux_hugepages() { 569 local node system_nodes 570 local nodes_to_use nodes_hp 571 572 if [[ $CLEAR_HUGE == yes ]]; then 573 clear_hugepages 574 fi 575 576 if [[ $HUGE_EVEN_ALLOC == yes ]]; then 577 clear_hugepages 578 check_hugepages_alloc /proc/sys/vm/nr_hugepages 579 return 0 580 fi 581 582 for node in /sys/devices/system/node/node*; do 583 [[ -e $node ]] || continue 584 nodes[${node##*node}]=$node/hugepages/hugepages-${HUGEPGSZ}kB/nr_hugepages 585 done 586 587 if ((${#nodes[@]} == 0)); then 588 # No NUMA support? Fallback to common interface 589 check_hugepages_alloc /proc/sys/vm/nr_hugepages 590 return 0 591 fi 592 593 IFS="," read -ra nodes_to_use <<< "$HUGENODE" 594 if ((${#nodes_to_use[@]} == 0)); then 595 nodes_to_use[0]=0 596 fi 597 598 # Align indexes with node ids 599 for node in "${!nodes_to_use[@]}"; do 600 if [[ ${nodes_to_use[node]} =~ ^nodes_hp\[[0-9]+\]= ]]; then 601 eval "${nodes_to_use[node]}" 602 elif [[ ${nodes_to_use[node]} =~ ^[0-9]+$ ]]; then 603 nodes_hp[nodes_to_use[node]]=$NRHUGE 604 fi 605 done 606 607 for node in "${!nodes_hp[@]}"; do 608 if [[ -z ${nodes[node]} ]]; then 609 echo "Node $node doesn't exist, ignoring" >&2 610 continue 611 fi 612 NRHUGE=${nodes_hp[node]:-$NRHUGE} check_hugepages_alloc "${nodes[node]}" "$node" 613 done 614} 615 616function configure_linux() { 617 configure_linux_pci 618 hugetlbfs_mounts=$(linux_hugetlbfs_mounts) 619 620 if [ -z "$hugetlbfs_mounts" ]; then 621 hugetlbfs_mounts=/mnt/huge 622 echo "Mounting hugetlbfs at $hugetlbfs_mounts" 623 mkdir -p "$hugetlbfs_mounts" 624 mount -t hugetlbfs nodev "$hugetlbfs_mounts" 625 fi 626 627 configure_linux_hugepages 628 629 if [ "$driver_name" = "vfio-pci" ]; then 630 if [ -n "$TARGET_USER" ]; then 631 for mount in $hugetlbfs_mounts; do 632 chown "$TARGET_USER" "$mount" 633 chmod g+w "$mount" 634 done 635 636 MEMLOCK_AMNT=$(su "$TARGET_USER" -c "ulimit -l") 637 if [[ $MEMLOCK_AMNT != "unlimited" ]]; then 638 MEMLOCK_MB=$((MEMLOCK_AMNT / 1024)) 639 cat <<- MEMLOCK 640 "$TARGET_USER" user memlock limit: $MEMLOCK_MB MB 641 642 This is the maximum amount of memory you will be 643 able to use with DPDK and VFIO if run as user "$TARGET_USER". 644 To change this, please adjust limits.conf memlock limit for user "$TARGET_USER". 645 MEMLOCK 646 if ((MEMLOCK_AMNT < 65536)); then 647 echo "" 648 echo "## WARNING: memlock limit is less than 64MB" 649 echo -n "## DPDK with VFIO may not be able to initialize " 650 echo "if run as user \"$TARGET_USER\"." 651 fi 652 fi 653 fi 654 fi 655 656 if [ $(uname -i) == "x86_64" ] && [ ! -e /dev/cpu/0/msr ]; then 657 # Some distros build msr as a module. Make sure it's loaded to ensure 658 # DPDK can easily figure out the TSC rate rather than relying on 100ms 659 # sleeps. 660 modprobe msr &> /dev/null || true 661 fi 662} 663 664function reset_linux_pci() { 665 # virtio 666 # TODO: check if virtio-pci is loaded first and just unbind if it is not loaded 667 # Requires some more investigation - for example, some kernels do not seem to have 668 # virtio-pci but just virtio_scsi instead. Also need to make sure we get the 669 # underscore vs. dash right in the virtio_scsi name. 670 modprobe virtio-pci || true 671 for bdf in "${!all_devices_d[@]}"; do 672 ((all_devices_d["$bdf"] == 0)) || continue 673 674 driver=$(collect_driver "$bdf") 675 if [[ -n $driver ]] && ! check_for_driver "$driver"; then 676 linux_bind_driver "$bdf" "$driver" 677 else 678 linux_unbind_driver "$bdf" 679 fi 680 done 681 682 echo "1" > "/sys/bus/pci/rescan" 683} 684 685function reset_linux() { 686 reset_linux_pci 687 for mount in $(linux_hugetlbfs_mounts); do 688 for hp in "$mount"/spdk*map_*; do 689 flock -n "$hp" true && rm -f "$hp" 690 done 691 done 692 rm -f /run/.spdk* 693} 694 695function status_linux() { 696 echo "Hugepages" >&2 697 printf "%-6s %10s %8s / %6s\n" "node" "hugesize" "free" "total" >&2 698 699 numa_nodes=0 700 for path in /sys/devices/system/node/node*/hugepages/hugepages-*/; do 701 numa_nodes=$((numa_nodes + 1)) 702 free_pages=$(cat $path/free_hugepages) 703 all_pages=$(cat $path/nr_hugepages) 704 705 [[ $path =~ (node[0-9]+)/hugepages/hugepages-([0-9]+kB) ]] 706 707 node=${BASH_REMATCH[1]} 708 huge_size=${BASH_REMATCH[2]} 709 710 printf "%-6s %10s %8s / %6s\n" $node $huge_size $free_pages $all_pages 711 done 712 713 # fall back to system-wide hugepages 714 if [ "$numa_nodes" = "0" ]; then 715 free_pages=$(grep HugePages_Free /proc/meminfo | awk '{ print $2 }') 716 all_pages=$(grep HugePages_Total /proc/meminfo | awk '{ print $2 }') 717 node="-" 718 huge_size="$HUGEPGSZ" 719 720 printf "%-6s %10s %8s / %6s\n" $node $huge_size $free_pages $all_pages 721 fi 722 723 printf '\n%-25s %-15s %-6s %-6s %-7s %-16s %-10s %s\n' \ 724 "Type" "BDF" "Vendor" "Device" "NUMA" "Driver" "Device" "Block devices" >&2 725 726 sorted_bdfs=($(printf '%s\n' "${!all_devices_d[@]}" | sort)) 727 728 for bdf in "${sorted_bdfs[@]}"; do 729 driver=${drivers_d["$bdf"]} 730 if [ "$numa_nodes" = "0" ]; then 731 node="-" 732 else 733 node=$(cat /sys/bus/pci/devices/$bdf/numa_node) 734 if ((node == -1)); then 735 node=unknown 736 fi 737 fi 738 if [ "$driver" = "nvme" ] && [ -d /sys/bus/pci/devices/$bdf/nvme ]; then 739 name=$(ls /sys/bus/pci/devices/$bdf/nvme) 740 else 741 name="-" 742 fi 743 744 if [[ -n ${nvme_d["$bdf"]} || -n ${virtio_d["$bdf"]} ]]; then 745 blknames=($(get_block_dev_from_bdf "$bdf")) 746 else 747 blknames=("-") 748 fi 749 750 desc="" 751 desc=${desc:-${nvme_d["$bdf"]:+NVMe${nvme_vmd_d["$bdf"]:+@${nvme_vmd_d["$bdf"]}(VMD)}}} 752 desc=${desc:-${ioat_d["$bdf"]:+I/OAT}} 753 desc=${desc:-${dsa_d["$bdf"]:+DSA}} 754 desc=${desc:-${iaa_d["$bdf"]:+IAA}} 755 desc=${desc:-${virtio_d["$bdf"]:+virtio}} 756 desc=${desc:-${vmd_d["$bdf"]:+VMD}} 757 758 printf '%-25s %-15s %-6s %-6s %-7s %-16s %-10s %s\n' \ 759 "$desc" "$bdf" "${pci_ids_vendor["$bdf"]#0x}" "${pci_ids_device["$bdf"]#0x}" \ 760 "$node" "${driver:--}" "${name:-}" "${blknames[*]:--}" 761 done 762} 763 764function status_freebsd() { 765 local pci 766 767 status_print() ( 768 local type=$1 769 local dev driver 770 771 shift 772 773 for pci; do 774 printf '%-8s %-15s %-6s %-6s %-16s\n' \ 775 "$type" \ 776 "$pci" \ 777 "${pci_ids_vendor["$pci"]}" \ 778 "${pci_ids_device["$pci"]}" \ 779 "${pci_bus_driver["$pci"]}" 780 done | sort -k2,2 781 ) 782 783 local contigmem=present 784 local contigmem_buffer_size 785 local contigmem_num_buffers 786 787 if ! kldstat -q -m contigmem; then 788 contigmem="not present" 789 fi 790 if ! contigmem_buffer_size=$(kenv hw.contigmem.buffer_size 2> /dev/null); then 791 contigmem_buffer_size="not set" 792 fi 793 if ! contigmem_num_buffers=$(kenv hw.contigmem.num_buffers 2> /dev/null); then 794 contigmem_num_buffers="not set" 795 fi 796 797 cat <<- BSD_INFO 798 Contigmem ($contigmem) 799 Buffer Size: $contigmem_buffer_size 800 Num Buffers: $contigmem_num_buffers 801 802 BSD_INFO 803 804 printf '\n%-8s %-15s %-6s %-6s %-16s\n' \ 805 "Type" "BDF" "Vendor" "Device" "Driver" >&2 806 807 status_print "NVMe" "${!nvme_d[@]}" 808 status_print "I/OAT" "${!ioat_d[@]}" 809 status_print "DSA" "${!dsa_d[@]}" 810 status_print "IAA" "${!iaa_d[@]}" 811 status_print "VMD" "${!vmd_d[@]}" 812} 813 814function configure_freebsd_pci() { 815 local BDFS 816 817 BDFS+=("${!nvme_d[@]}") 818 BDFS+=("${!ioat_d[@]}") 819 BDFS+=("${!dsa_d[@]}") 820 BDFS+=("${!iaa_d[@]}") 821 BDFS+=("${!vmd_d[@]}") 822 823 # Drop the domain part from all the addresses 824 BDFS=("${BDFS[@]#*:}") 825 826 local IFS="," 827 kldunload nic_uio.ko || true 828 kenv hw.nic_uio.bdfs="${BDFS[*]}" 829 kldload nic_uio.ko 830} 831 832function configure_freebsd() { 833 if ! check_for_driver_freebsd; then 834 echo "DPDK drivers (contigmem and/or nic_uio) are missing, aborting" >&2 835 return 1 836 fi 837 configure_freebsd_pci 838 # If contigmem is already loaded but the HUGEMEM specified doesn't match the 839 # previous value, unload contigmem so that we can reload with the new value. 840 if kldstat -q -m contigmem; then 841 # contigmem may be loaded, but the kernel environment doesn't have to 842 # be necessarily set at this point. If it isn't, kenv will fail to 843 # pick up the hw. options. Handle it. 844 if ! contigmem_num_buffers=$(kenv hw.contigmem.num_buffers); then 845 contigmem_num_buffers=-1 846 fi 2> /dev/null 847 if ((contigmem_num_buffers != HUGEMEM / 256)); then 848 kldunload contigmem.ko 849 fi 850 fi 851 if ! kldstat -q -m contigmem; then 852 kenv hw.contigmem.num_buffers=$((HUGEMEM / 256)) 853 kenv hw.contigmem.buffer_size=$((256 * 1024 * 1024)) 854 kldload contigmem.ko 855 fi 856} 857 858function reset_freebsd() { 859 kldunload contigmem.ko || true 860 kldunload nic_uio.ko || true 861} 862 863function set_hp() { 864 if [[ -n $HUGEPGSZ && ! -e /sys/kernel/mm/hugepages/hugepages-${HUGEPGSZ}kB ]]; then 865 echo "${HUGEPGSZ}kB is not supported by the running kernel, ignoring" >&2 866 unset -v HUGEPGSZ 867 fi 868 869 HUGEPGSZ=${HUGEPGSZ:-$(grep Hugepagesize /proc/meminfo | cut -d : -f 2 | tr -dc '0-9')} 870 HUGEPGSZ_MB=$((HUGEPGSZ / 1024)) 871 NRHUGE=${NRHUGE:-$(((HUGEMEM + HUGEPGSZ_MB - 1) / HUGEPGSZ_MB))} 872} 873 874CMD=reset cache_pci_bus 875 876mode=$1 877 878if [ -z "$mode" ]; then 879 mode="config" 880fi 881 882: ${HUGEMEM:=2048} 883: ${PCI_ALLOWED:=""} 884: ${PCI_BLOCKED:=""} 885 886if [ -n "$NVME_ALLOWED" ]; then 887 PCI_ALLOWED="$PCI_ALLOWED $NVME_ALLOWED" 888fi 889 890if [ -n "$SKIP_PCI" ]; then 891 PCI_ALLOWED="none" 892fi 893 894if [ -z "$TARGET_USER" ]; then 895 TARGET_USER="$SUDO_USER" 896 if [ -z "$TARGET_USER" ]; then 897 TARGET_USER=$(logname 2> /dev/null) || true 898 fi 899fi 900 901collect_devices "$mode" 902 903if [[ $os == Linux ]]; then 904 set_hp 905fi 906 907if [[ $mode == interactive ]]; then 908 source "$rootdir/scripts/common/setup/interactive.sh" 909 main_menu "$2" || exit 0 910fi 911 912if [[ $mode == reset && $PCI_BLOCK_SYNC_ON_RESET == yes ]]; then 913 # Note that this will wait only for the first block device attached to 914 # a given storage controller. For nvme this may miss some of the devs 915 # in case multiple namespaces are being in place. 916 # FIXME: Wait for nvme controller(s) to be in live state and determine 917 # number of configured namespaces, build list of potential block devs 918 # and pass them to sync_dev_uevents. Is it worth the effort? 919 bdfs_to_wait_for=() 920 for bdf in "${!all_devices_d[@]}"; do 921 ((all_devices_d["$bdf"] == 0)) || continue 922 if [[ -n ${nvme_d["$bdf"]} || -n ${virtio_d["$bdf"]} ]]; then 923 [[ $(collect_driver "$bdf") != "${drivers_d["$bdf"]}" ]] || continue 924 bdfs_to_wait_for+=("$bdf") 925 fi 926 done 927 if ((${#bdfs_to_wait_for[@]} > 0)); then 928 echo "Waiting for block devices as requested" 929 export UEVENT_TIMEOUT=5 DEVPATH_LOOKUP=yes DEVPATH_SUBSYSTEM=pci 930 "$rootdir/scripts/sync_dev_uevents.sh" \ 931 block/disk \ 932 "${bdfs_to_wait_for[@]}" & 933 sync_pid=$! 934 fi 935fi 936 937if [[ $os == Linux ]]; then 938 if [ "$mode" == "config" ]; then 939 configure_linux 940 elif [ "$mode" == "cleanup" ]; then 941 cleanup_linux 942 clear_hugepages 943 elif [ "$mode" == "reset" ]; then 944 reset_linux 945 elif [ "$mode" == "status" ]; then 946 status_linux 947 elif [ "$mode" == "help" ]; then 948 usage $0 949 else 950 usage $0 "Invalid argument '$mode'" 951 fi 952else 953 if [ "$mode" == "config" ]; then 954 configure_freebsd 955 elif [ "$mode" == "reset" ]; then 956 reset_freebsd 957 elif [ "$mode" == "cleanup" ]; then 958 echo "setup.sh cleanup function not yet supported on $os" 959 elif [ "$mode" == "status" ]; then 960 status_freebsd 961 elif [ "$mode" == "help" ]; then 962 usage $0 963 else 964 usage $0 "Invalid argument '$mode'" 965 fi 966fi 967 968if [[ -e /proc/$sync_pid/status ]]; then 969 wait "$sync_pid" 970fi 971