1#!/usr/bin/env bash 2# SPDX-License-Identifier: BSD-3-Clause 3# Copyright (C) 2016 Intel Corporation 4# All rights reserved. 5# 6set -e 7shopt -s nullglob extglob 8 9os=$(uname -s) 10 11if [[ $os != Linux && $os != FreeBSD ]]; then 12 echo "Not supported platform ($os), aborting" 13 exit 1 14fi 15 16rootdir=$(readlink -f $(dirname $0))/.. 17source "$rootdir/scripts/common.sh" 18 19function usage() { 20 if [[ $os == Linux ]]; then 21 options="[config|reset|status|cleanup|help]" 22 else 23 options="[config|reset|help]" 24 fi 25 26 [[ -n $2 ]] && ( 27 echo "$2" 28 echo "" 29 ) 30 echo "Helper script for allocating hugepages and binding NVMe, I/OAT, VMD and Virtio devices" 31 echo "to a generic VFIO kernel driver. If VFIO is not available on the system, this script" 32 echo "will fall back to UIO. NVMe and Virtio devices with active mountpoints will be ignored." 33 echo "All hugepage operations use default hugepage size on the system (hugepagesz)." 34 echo "Usage: $(basename $1) $options" 35 echo 36 echo "$options - as following:" 37 echo "config Default mode. Allocate hugepages and bind PCI devices." 38 if [[ $os == Linux ]]; then 39 echo "cleanup Remove any orphaned files that can be left in the system after SPDK application exit" 40 fi 41 echo "reset Rebind PCI devices back to their original drivers." 42 echo " Also cleanup any leftover spdk files/resources." 43 echo " Hugepage memory size will remain unchanged." 44 if [[ $os == Linux ]]; then 45 echo "status Print status of all SPDK-compatible devices on the system." 46 fi 47 echo "help Print this help message." 48 echo 49 echo "The following environment variables can be specified." 50 echo "HUGEMEM Size of hugepage memory to allocate (in MB). 2048 by default." 51 echo " For NUMA systems, the hugepages will be distributed on node0 by" 52 echo " default." 53 echo "HUGE_EVEN_ALLOC If set to 'yes', hugepages will be evenly distributed across all" 54 echo " system's NUMA nodes (effectively ignoring anything set in HUGENODE)." 55 echo " Uses kernel's default for hugepages size." 56 echo "NRHUGE Number of hugepages to allocate. This variable overwrites HUGEMEM." 57 echo "HUGENODE Specific NUMA node to allocate hugepages on. Multiple nodes can be" 58 echo " separated with comas. By default, NRHUGE will be applied on each node." 59 echo " Hugepages can be defined per node with e.g.:" 60 echo " HUGENODE='nodes_hp[0]=2048,nodes_hp[1]=512,2' - this will allocate" 61 echo " 2048 pages for node0, 512 for node1 and default NRHUGE for node2." 62 echo "HUGEPGSZ Size of the hugepages to use in kB. If not set, kernel's default" 63 echo " setting is used." 64 echo "SHRINK_HUGE If set to 'yes', hugepages allocation won't be skipped in case" 65 echo " number of requested hugepages is lower from what's already" 66 echo " allocated. Doesn't apply when HUGE_EVEN_ALLOC is in use." 67 echo "CLEAR_HUGE If set to 'yes', the attempt to remove hugepages from all nodes will" 68 echo " be made prior to allocation". 69 echo "PCI_ALLOWED" 70 echo "PCI_BLOCKED Whitespace separated list of PCI devices (NVMe, I/OAT, VMD, Virtio)." 71 echo " Each device must be specified as a full PCI address." 72 echo " E.g. PCI_ALLOWED=\"0000:01:00.0 0000:02:00.0\"" 73 echo " To block all PCI devices: PCI_ALLOWED=\"none\"" 74 echo " To allow all PCI devices except 0000:01:00.0: PCI_BLOCKED=\"0000:01:00.0\"" 75 echo " To allow only PCI device 0000:01:00.0: PCI_ALLOWED=\"0000:01:00.0\"" 76 echo " If PCI_ALLOWED and PCI_BLOCKED are empty or unset, all PCI devices" 77 echo " will be bound." 78 echo " Each device in PCI_BLOCKED will be ignored (driver won't be changed)." 79 echo " PCI_BLOCKED has precedence over PCI_ALLOWED." 80 echo "TARGET_USER User that will own hugepage mountpoint directory and vfio groups." 81 echo " By default the current user will be used." 82 echo "DRIVER_OVERRIDE Disable automatic vfio-pci/uio_pci_generic selection and forcefully" 83 echo " bind devices to the given driver." 84 echo " E.g. DRIVER_OVERRIDE=uio_pci_generic or DRIVER_OVERRIDE=/home/public/dpdk/build/kmod/igb_uio.ko" 85 echo "PCI_BLOCK_SYNC_ON_RESET" 86 echo " If set in the environment, the attempt to wait for block devices associated" 87 echo " with given PCI device will be made upon reset" 88 exit 0 89} 90 91# In monolithic kernels the lsmod won't work. So 92# back that with a /sys/modules. We also check 93# /sys/bus/pci/drivers/ as neither lsmod nor /sys/modules might 94# contain needed info (like in Fedora-like OS). 95function check_for_driver() { 96 if [[ -z $1 ]]; then 97 return 0 98 fi 99 100 if lsmod | grep -q ${1//-/_}; then 101 return 1 102 fi 103 104 if [[ -d /sys/module/${1} || -d \ 105 /sys/module/${1//-/_} || -d \ 106 /sys/bus/pci/drivers/${1} || -d \ 107 /sys/bus/pci/drivers/${1//-/_} ]]; then 108 return 2 109 fi 110 return 0 111} 112 113function check_for_driver_freebsd() { 114 # Check if dpdk drivers (nic_uio, contigmem) are in the kernel's module path. 115 local search_paths path driver 116 IFS=";" read -ra search_paths < <(kldconfig -rU) 117 118 for driver in contigmem.ko nic_uio.ko; do 119 for path in "${search_paths[@]}"; do 120 [[ -f $path/$driver ]] && continue 2 121 done 122 return 1 123 done 124 return 0 125} 126 127function pci_dev_echo() { 128 local bdf="$1" 129 shift 130 echo "$bdf (${pci_ids_vendor["$bdf"]#0x} ${pci_ids_device["$bdf"]#0x}): $*" 131} 132 133function linux_bind_driver() { 134 bdf="$1" 135 driver_name="$2" 136 old_driver_name=${drivers_d["$bdf"]:-no driver} 137 138 if [[ $driver_name == "$old_driver_name" ]]; then 139 pci_dev_echo "$bdf" "Already using the $old_driver_name driver" 140 return 0 141 fi 142 143 if [[ $old_driver_name != "no driver" ]]; then 144 echo "$bdf" > "/sys/bus/pci/devices/$bdf/driver/unbind" 145 fi 146 147 pci_dev_echo "$bdf" "$old_driver_name -> $driver_name" 148 149 if [[ $driver_name == "none" ]]; then 150 return 0 151 fi 152 153 local probe_attempts=0 154 echo "$driver_name" > "/sys/bus/pci/devices/$bdf/driver_override" 155 while ! echo "$bdf" > "/sys/bus/pci/drivers_probe" && ((probe_attempts++ < 10)); do 156 pci_dev_echo "$bdf" "failed to bind to $driver_name, retrying ($probe_attempts)" 157 sleep 0.5 158 done 2> /dev/null 159 160 echo "" > "/sys/bus/pci/devices/$bdf/driver_override" 161 162 if [[ ! -e /sys/bus/pci/drivers/$driver_name/$bdf ]]; then 163 pci_dev_echo "$bdf" "failed to bind to $driver_name, aborting" 164 return 1 165 fi 166 167 iommu_group=$(basename $(readlink -f /sys/bus/pci/devices/$bdf/iommu_group)) 168 if [ -e "/dev/vfio/$iommu_group" ]; then 169 if [ -n "$TARGET_USER" ]; then 170 chown "$TARGET_USER" "/dev/vfio/$iommu_group" 171 fi 172 fi 173} 174 175function linux_unbind_driver() { 176 local bdf="$1" 177 local old_driver_name=${drivers_d["$bdf"]:-no driver} 178 179 if [[ $old_driver_name == "no driver" ]]; then 180 pci_dev_echo "$bdf" "Not bound to any driver" 181 return 0 182 fi 183 184 if [[ -e /sys/bus/pci/drivers/$old_driver_name ]]; then 185 echo "$bdf" > "/sys/bus/pci/drivers/$old_driver_name/unbind" 186 echo "" > "/sys/bus/pci/devices/$bdf/driver_override" 187 fi 188 189 pci_dev_echo "$bdf" "$old_driver_name -> no driver" 190} 191 192function linux_hugetlbfs_mounts() { 193 mount | grep ' type hugetlbfs ' | awk '{ print $3 }' 194} 195 196function get_block_dev_from_bdf() { 197 local bdf=$1 198 local block blocks=() ctrl 199 200 for block in /sys/block/*; do 201 if [[ $block == *nvme* ]]; then 202 ctrl=${block##*/} ctrl=${ctrl%n*} 203 if [[ -e /sys/class/nvme/$ctrl && $(< "/sys/class/nvme/$ctrl/address") == "$bdf" ]]; then 204 blocks+=("${block##*/}") 205 fi 206 elif [[ $(readlink -f "$block/device") == *"/$bdf/"* ]]; then 207 blocks+=("${block##*/}") 208 fi 209 done 210 printf '%s\n' "${blocks[@]}" 211} 212 213function get_used_bdf_block_devs() { 214 local bdf=$1 215 local blocks block blockp dev mount holder 216 local used 217 218 hash lsblk &> /dev/null || return 1 219 blocks=($(get_block_dev_from_bdf "$bdf")) 220 221 for block in "${blocks[@]}"; do 222 # Check if the device is hold by some other, regardless if it's mounted 223 # or not. 224 for holder in "/sys/class/block/$block"*/holders/*; do 225 [[ -e $holder ]] || continue 226 blockp=${holder%/holders*} blockp=${blockp##*/} 227 if [[ -e $holder/slaves/$blockp ]]; then 228 used+=("holder@$blockp:${holder##*/}") 229 fi 230 done 231 while read -r dev mount; do 232 if [[ -e $mount ]]; then 233 used+=("mount@$block:$dev") 234 fi 235 done < <(lsblk -l -n -o NAME,MOUNTPOINT "/dev/$block") 236 if ((${#used[@]} == 0)); then 237 # Make sure we check if there's any valid data present on the target device 238 # regardless if it's being actively used or not. This is mainly done to make 239 # sure we don't miss more complex setups like ZFS pools, etc. 240 if block_in_use "$block" > /dev/null; then 241 used+=("data@$block") 242 fi 243 fi 244 done 245 246 if ((${#used[@]} > 0)); then 247 printf '%s\n' "${used[@]}" 248 fi 249} 250 251function collect_devices() { 252 # NVMe, IOAT, DSA, IAA, VIRTIO, VMD 253 254 local ids dev_type dev_id bdf bdfs in_use driver 255 256 ids+="PCI_DEVICE_ID_INTEL_IOAT" 257 ids+="|PCI_DEVICE_ID_INTEL_DSA" 258 ids+="|PCI_DEVICE_ID_INTEL_IAA" 259 ids+="|PCI_DEVICE_ID_VIRTIO" 260 ids+="|PCI_DEVICE_ID_INTEL_VMD" 261 ids+="|SPDK_PCI_CLASS_NVME" 262 263 local -gA nvme_d ioat_d dsa_d iaa_d virtio_d vmd_d all_devices_d drivers_d 264 265 while read -r _ dev_type dev_id; do 266 bdfs=(${pci_bus_cache["0x8086:$dev_id"]}) 267 [[ $dev_type == *NVME* ]] && bdfs=(${pci_bus_cache["$dev_id"]}) 268 [[ $dev_type == *VIRT* ]] && bdfs=(${pci_bus_cache["0x1af4:$dev_id"]}) 269 [[ $dev_type =~ (NVME|IOAT|DSA|IAA|VIRTIO|VMD) ]] && dev_type=${BASH_REMATCH[1],,} 270 for bdf in "${bdfs[@]}"; do 271 in_use=0 272 if [[ $1 != status ]]; then 273 if ! pci_can_use "$bdf"; then 274 pci_dev_echo "$bdf" "Skipping denied controller at $bdf" 275 in_use=1 276 fi 277 if [[ $dev_type == nvme || $dev_type == virtio ]]; then 278 if ! verify_bdf_block_devs "$bdf"; then 279 in_use=1 280 fi 281 fi 282 if [[ $dev_type == vmd ]]; then 283 if [[ $PCI_ALLOWED != *"$bdf"* ]]; then 284 pci_dev_echo "$bdf" "Skipping not allowed VMD controller at $bdf" 285 in_use=1 286 elif [[ " ${drivers_d[*]} " =~ "nvme" ]]; then 287 if [[ "${DRIVER_OVERRIDE}" != "none" ]]; then 288 if [ "$mode" == "config" ]; then 289 cat <<- MESSAGE 290 Binding new driver to VMD device. If there are NVMe SSDs behind the VMD endpoint 291 which are attached to the kernel NVMe driver,the binding process may go faster 292 if you first run this script with DRIVER_OVERRIDE="none" to unbind only the 293 NVMe SSDs, and then run again to unbind the VMD devices." 294 MESSAGE 295 fi 296 fi 297 fi 298 fi 299 fi 300 eval "${dev_type}_d[$bdf]=$in_use" 301 all_devices_d["$bdf"]=$in_use 302 if [[ -e /sys/bus/pci/devices/$bdf/driver ]]; then 303 driver=$(readlink -f "/sys/bus/pci/devices/$bdf/driver") 304 drivers_d["$bdf"]=${driver##*/} 305 fi 306 done 307 done < <(grep -E "$ids" "$rootdir/include/spdk/pci_ids.h") 308} 309 310function collect_driver() { 311 local bdf=$1 312 local drivers driver 313 314 if [[ -e /sys/bus/pci/devices/$bdf/modalias ]] \ 315 && drivers=($(modprobe -R "$(< "/sys/bus/pci/devices/$bdf/modalias")")); then 316 # Pick first entry in case multiple aliases are bound to a driver. 317 driver=$(readlink -f "/sys/module/${drivers[0]}/drivers/pci:"*) 318 driver=${driver##*/} 319 else 320 [[ -n ${nvme_d["$bdf"]} ]] && driver=nvme 321 [[ -n ${ioat_d["$bdf"]} ]] && driver=ioatdma 322 [[ -n ${dsa_d["$bdf"]} ]] && driver=dsa 323 [[ -n ${iaa_d["$bdf"]} ]] && driver=iaa 324 [[ -n ${virtio_d["$bdf"]} ]] && driver=virtio-pci 325 [[ -n ${vmd_d["$bdf"]} ]] && driver=vmd 326 fi 2> /dev/null 327 echo "$driver" 328} 329 330function verify_bdf_block_devs() { 331 local bdf=$1 332 local blknames 333 blknames=($(get_used_bdf_block_devs "$bdf")) || return 1 334 335 if ((${#blknames[@]} > 0)); then 336 local IFS="," 337 pci_dev_echo "$bdf" "Active devices: ${blknames[*]}, so not binding PCI dev" 338 return 1 339 fi 340} 341 342function configure_linux_pci() { 343 local driver_path="" 344 driver_name="" 345 igb_uio_fallback="" 346 347 if [[ -r "$rootdir/dpdk/build-tmp/kernel/linux/igb_uio/igb_uio.ko" ]]; then 348 # igb_uio is a common driver to override with and it depends on uio. 349 modprobe uio || true 350 if ! check_for_driver igb_uio || insmod "$rootdir/dpdk/build-tmp/kernel/linux/igb_uio/igb_uio.ko"; then 351 igb_uio_fallback="$rootdir/dpdk/build-tmp/kernel/linux/igb_uio/igb_uio.ko" 352 fi 353 fi 354 355 if [[ "${DRIVER_OVERRIDE}" == "none" ]]; then 356 driver_name=none 357 elif [[ -n "${DRIVER_OVERRIDE}" ]]; then 358 driver_path="$DRIVER_OVERRIDE" 359 driver_name="${DRIVER_OVERRIDE##*/}" 360 # modprobe and the sysfs don't use the .ko suffix. 361 driver_name=${driver_name%.ko} 362 # path = name -> there is no path 363 if [[ "$driver_path" = "$driver_name" ]]; then 364 driver_path="" 365 fi 366 elif [[ -n "$(ls /sys/kernel/iommu_groups)" || (-e \ 367 /sys/module/vfio/parameters/enable_unsafe_noiommu_mode && \ 368 "$(cat /sys/module/vfio/parameters/enable_unsafe_noiommu_mode)" == "Y") ]]; then 369 driver_name=vfio-pci 370 # Just in case, attempt to load VFIO_IOMMU_TYPE1 module into the kernel - this 371 # should be done automatically by modprobe since this particular module should 372 # be a part of vfio-pci dependencies, however, on some distros, it seems that 373 # it's not the case. See #1689. 374 if modinfo vfio_iommu_type1 > /dev/null; then 375 modprobe vfio_iommu_type1 376 fi 377 elif ! check_for_driver uio_pci_generic || modinfo uio_pci_generic > /dev/null 2>&1; then 378 driver_name=uio_pci_generic 379 elif [[ -e $igb_uio_fallback ]]; then 380 driver_path="$igb_uio_fallback" 381 driver_name="igb_uio" 382 echo "WARNING: uio_pci_generic not detected - using $driver_name" 383 else 384 echo "No valid drivers found [vfio-pci, uio_pci_generic, igb_uio]. Please enable one of the kernel modules." 385 return 1 386 fi 387 388 # modprobe assumes the directory of the module. If the user passes in a path, we should use insmod 389 if [[ $driver_name != "none" ]]; then 390 if [[ -n "$driver_path" ]]; then 391 insmod $driver_path || true 392 else 393 modprobe $driver_name 394 fi 395 fi 396 397 for bdf in "${!all_devices_d[@]}"; do 398 if ((all_devices_d["$bdf"] == 0)); then 399 if [[ -n ${nvme_d["$bdf"]} ]]; then 400 # Some nvme controllers may take significant amount of time while being 401 # unbound from the driver. Put that task into background to speed up the 402 # whole process. Currently this is done only for the devices bound to the 403 # nvme driver as other, i.e., ioatdma's, trigger a kernel BUG when being 404 # unbound in parallel. See https://bugzilla.kernel.org/show_bug.cgi?id=209041. 405 linux_bind_driver "$bdf" "$driver_name" & 406 else 407 linux_bind_driver "$bdf" "$driver_name" 408 fi 409 fi 410 done 411 wait 412 413 echo "1" > "/sys/bus/pci/rescan" 414} 415 416function cleanup_linux() { 417 local dirs_to_clean=() files_to_clean=() opened_files=() file_locks=() 418 local match_spdk="spdk_tgt|iscsi|vhost|nvmf|rocksdb|bdevio|bdevperf|vhost_fuzz|nvme_fuzz|accel_perf|bdev_svc" 419 420 dirs_to_clean=({/var/run,/tmp}/dpdk/spdk{,_pid}+([0-9])) 421 if [[ -d $XDG_RUNTIME_DIR ]]; then 422 dirs_to_clean+=("$XDG_RUNTIME_DIR/dpdk/spdk"{,_pid}+([0-9])) 423 fi 424 425 for dir in "${dirs_to_clean[@]}"; do 426 files_to_clean+=("$dir/"*) 427 done 428 file_locks+=(/var/tmp/spdk_pci_lock*) 429 file_locks+=(/var/tmp/spdk_cpu_lock*) 430 431 files_to_clean+=(/dev/shm/@(@($match_spdk)_trace|spdk_iscsi_conns)*) 432 files_to_clean+=("${file_locks[@]}") 433 434 # This may fail in case path that readlink attempts to resolve suddenly 435 # disappears (as it may happen with terminating processes). 436 opened_files+=($(readlink -f /proc/+([0-9])/fd/+([0-9]))) || true 437 438 if ((${#opened_files[@]} == 0)); then 439 echo "Can't get list of opened files!" 440 exit 1 441 fi 442 443 echo 'Cleaning' 444 for f in "${files_to_clean[@]}"; do 445 [[ -e $f ]] || continue 446 if [[ ${opened_files[*]} != *"$f"* ]]; then 447 echo "Removing: $f" 448 rm $f 449 else 450 echo "Still open: $f" 451 fi 452 done 453 454 for dir in "${dirs_to_clean[@]}"; do 455 [[ -d $dir ]] || continue 456 if [[ ${opened_files[*]} != *"$dir"* ]]; then 457 echo "Removing: $dir" 458 rmdir $dir 459 else 460 echo "Still open: $dir" 461 fi 462 done 463 echo "Clean" 464} 465 466check_hugepages_alloc() { 467 local hp_int=$1 468 local allocated_hugepages 469 470 allocated_hugepages=$(< "$hp_int") 471 472 if ((NRHUGE <= allocated_hugepages)) && [[ $SHRINK_HUGE != yes ]]; then 473 echo "INFO: Requested $NRHUGE hugepages but $allocated_hugepages already allocated ${2:+on node$2}" 474 return 0 475 fi 476 477 echo $((NRHUGE < 0 ? 0 : NRHUGE)) > "$hp_int" 478 479 allocated_hugepages=$(< "$hp_int") 480 if ((allocated_hugepages < NRHUGE)); then 481 cat <<- ERROR 482 483 ## ERROR: requested $NRHUGE hugepages but $allocated_hugepages could be allocated ${2:+on node$2}. 484 ## Memory might be heavily fragmented. Please try flushing the system cache, or reboot the machine. 485 ERROR 486 return 1 487 fi 488} 489 490clear_hugepages() { echo 0 > /proc/sys/vm/nr_hugepages; } 491 492configure_linux_hugepages() { 493 local node system_nodes 494 local nodes_to_use nodes_hp 495 496 if [[ $CLEAR_HUGE == yes ]]; then 497 clear_hugepages 498 fi 499 500 if [[ $HUGE_EVEN_ALLOC == yes ]]; then 501 clear_hugepages 502 check_hugepages_alloc /proc/sys/vm/nr_hugepages 503 return 0 504 fi 505 506 for node in /sys/devices/system/node/node*; do 507 [[ -e $node ]] || continue 508 nodes[${node##*node}]=$node/hugepages/hugepages-${HUGEPGSZ}kB/nr_hugepages 509 done 510 511 if ((${#nodes[@]} == 0)); then 512 # No NUMA support? Fallback to common interface 513 check_hugepages_alloc /proc/sys/vm/nr_hugepages 514 return 0 515 fi 516 517 IFS="," read -ra nodes_to_use <<< "$HUGENODE" 518 if ((${#nodes_to_use[@]} == 0)); then 519 nodes_to_use[0]=0 520 fi 521 522 # Align indexes with node ids 523 for node in "${!nodes_to_use[@]}"; do 524 if [[ ${nodes_to_use[node]} =~ ^nodes_hp\[[0-9]+\]= ]]; then 525 eval "${nodes_to_use[node]}" 526 elif [[ ${nodes_to_use[node]} =~ ^[0-9]+$ ]]; then 527 nodes_hp[nodes_to_use[node]]=$NRHUGE 528 fi 529 done 530 531 for node in "${!nodes_hp[@]}"; do 532 if [[ -z ${nodes[node]} ]]; then 533 echo "Node $node doesn't exist, ignoring" >&2 534 continue 535 fi 536 NRHUGE=${nodes_hp[node]:-$NRHUGE} check_hugepages_alloc "${nodes[node]}" "$node" 537 done 538} 539 540function configure_linux() { 541 configure_linux_pci 542 hugetlbfs_mounts=$(linux_hugetlbfs_mounts) 543 544 if [ -z "$hugetlbfs_mounts" ]; then 545 hugetlbfs_mounts=/mnt/huge 546 echo "Mounting hugetlbfs at $hugetlbfs_mounts" 547 mkdir -p "$hugetlbfs_mounts" 548 mount -t hugetlbfs nodev "$hugetlbfs_mounts" 549 fi 550 551 configure_linux_hugepages 552 553 if [ "$driver_name" = "vfio-pci" ]; then 554 if [ -n "$TARGET_USER" ]; then 555 for mount in $hugetlbfs_mounts; do 556 chown "$TARGET_USER" "$mount" 557 chmod g+w "$mount" 558 done 559 560 MEMLOCK_AMNT=$(su "$TARGET_USER" -c "ulimit -l") 561 if [[ $MEMLOCK_AMNT != "unlimited" ]]; then 562 MEMLOCK_MB=$((MEMLOCK_AMNT / 1024)) 563 cat <<- MEMLOCK 564 "$TARGET_USER" user memlock limit: $MEMLOCK_MB MB 565 566 This is the maximum amount of memory you will be 567 able to use with DPDK and VFIO if run as user "$TARGET_USER". 568 To change this, please adjust limits.conf memlock limit for user "$TARGET_USER". 569 MEMLOCK 570 if ((MEMLOCK_AMNT < 65536)); then 571 echo "" 572 echo "## WARNING: memlock limit is less than 64MB" 573 echo -n "## DPDK with VFIO may not be able to initialize " 574 echo "if run as user \"$TARGET_USER\"." 575 fi 576 fi 577 fi 578 fi 579 580 if [ $(uname -i) == "x86_64" ] && [ ! -e /dev/cpu/0/msr ]; then 581 # Some distros build msr as a module. Make sure it's loaded to ensure 582 # DPDK can easily figure out the TSC rate rather than relying on 100ms 583 # sleeps. 584 modprobe msr &> /dev/null || true 585 fi 586} 587 588function reset_linux_pci() { 589 # virtio 590 # TODO: check if virtio-pci is loaded first and just unbind if it is not loaded 591 # Requires some more investigation - for example, some kernels do not seem to have 592 # virtio-pci but just virtio_scsi instead. Also need to make sure we get the 593 # underscore vs. dash right in the virtio_scsi name. 594 modprobe virtio-pci || true 595 for bdf in "${!all_devices_d[@]}"; do 596 ((all_devices_d["$bdf"] == 0)) || continue 597 598 driver=$(collect_driver "$bdf") 599 if [[ -n $driver ]] && ! check_for_driver "$driver"; then 600 linux_bind_driver "$bdf" "$driver" 601 else 602 linux_unbind_driver "$bdf" 603 fi 604 done 605 606 echo "1" > "/sys/bus/pci/rescan" 607} 608 609function reset_linux() { 610 reset_linux_pci 611 for mount in $(linux_hugetlbfs_mounts); do 612 for hp in "$mount"/spdk*map_*; do 613 flock -n "$hp" true && rm -f "$hp" 614 done 615 done 616 rm -f /run/.spdk* 617} 618 619function status_linux() { 620 echo "Hugepages" >&2 621 printf "%-6s %10s %8s / %6s\n" "node" "hugesize" "free" "total" >&2 622 623 numa_nodes=0 624 for path in /sys/devices/system/node/node*/hugepages/hugepages-*/; do 625 numa_nodes=$((numa_nodes + 1)) 626 free_pages=$(cat $path/free_hugepages) 627 all_pages=$(cat $path/nr_hugepages) 628 629 [[ $path =~ (node[0-9]+)/hugepages/hugepages-([0-9]+kB) ]] 630 631 node=${BASH_REMATCH[1]} 632 huge_size=${BASH_REMATCH[2]} 633 634 printf "%-6s %10s %8s / %6s\n" $node $huge_size $free_pages $all_pages 635 done 636 637 # fall back to system-wide hugepages 638 if [ "$numa_nodes" = "0" ]; then 639 free_pages=$(grep HugePages_Free /proc/meminfo | awk '{ print $2 }') 640 all_pages=$(grep HugePages_Total /proc/meminfo | awk '{ print $2 }') 641 node="-" 642 huge_size="$HUGEPGSZ" 643 644 printf "%-6s %10s %8s / %6s\n" $node $huge_size $free_pages $all_pages 645 fi 646 647 printf '\n%-8s %-15s %-6s %-6s %-7s %-16s %-10s %s\n' \ 648 "Type" "BDF" "Vendor" "Device" "NUMA" "Driver" "Device" "Block devices" >&2 649 650 sorted_bdfs=($(printf '%s\n' "${!all_devices_d[@]}" | sort)) 651 652 for bdf in "${sorted_bdfs[@]}"; do 653 driver=${drivers_d["$bdf"]} 654 if [ "$numa_nodes" = "0" ]; then 655 node="-" 656 else 657 node=$(cat /sys/bus/pci/devices/$bdf/numa_node) 658 if ((node == -1)); then 659 node=unknown 660 fi 661 fi 662 if [ "$driver" = "nvme" ] && [ -d /sys/bus/pci/devices/$bdf/nvme ]; then 663 name=$(ls /sys/bus/pci/devices/$bdf/nvme) 664 else 665 name="-" 666 fi 667 668 if [[ -n ${nvme_d["$bdf"]} || -n ${virtio_d["$bdf"]} ]]; then 669 blknames=($(get_block_dev_from_bdf "$bdf")) 670 else 671 blknames=("-") 672 fi 673 674 desc="" 675 desc=${desc:-${nvme_d["$bdf"]:+NVMe}} 676 desc=${desc:-${ioat_d["$bdf"]:+I/OAT}} 677 desc=${desc:-${dsa_d["$bdf"]:+DSA}} 678 desc=${desc:-${iaa_d["$bdf"]:+IAA}} 679 desc=${desc:-${virtio_d["$bdf"]:+virtio}} 680 desc=${desc:-${vmd_d["$bdf"]:+VMD}} 681 682 printf '%-8s %-15s %-6s %-6s %-7s %-16s %-10s %s\n' \ 683 "$desc" "$bdf" "${pci_ids_vendor["$bdf"]#0x}" "${pci_ids_device["$bdf"]#0x}" \ 684 "$node" "${driver:--}" "${name:-}" "${blknames[*]:--}" 685 done 686} 687 688function status_freebsd() { 689 local pci 690 691 status_print() ( 692 local type=$1 693 local dev driver 694 695 shift 696 697 for pci; do 698 printf '%-8s %-15s %-6s %-6s %-16s\n' \ 699 "$type" \ 700 "$pci" \ 701 "${pci_ids_vendor["$pci"]}" \ 702 "${pci_ids_device["$pci"]}" \ 703 "${pci_bus_driver["$pci"]}" 704 done | sort -k2,2 705 ) 706 707 local contigmem=present 708 local contigmem_buffer_size 709 local contigmem_num_buffers 710 711 if ! kldstat -q -m contigmem; then 712 contigmem="not present" 713 fi 714 if ! contigmem_buffer_size=$(kenv hw.contigmem.buffer_size 2> /dev/null); then 715 contigmem_buffer_size="not set" 716 fi 717 if ! contigmem_num_buffers=$(kenv hw.contigmem.num_buffers 2> /dev/null); then 718 contigmem_num_buffers="not set" 719 fi 720 721 cat <<- BSD_INFO 722 Contigmem ($contigmem) 723 Buffer Size: $contigmem_buffer_size 724 Num Buffers: $contigmem_num_buffers 725 726 BSD_INFO 727 728 printf '\n%-8s %-15s %-6s %-6s %-16s\n' \ 729 "Type" "BDF" "Vendor" "Device" "Driver" >&2 730 731 status_print "NVMe" "${!nvme_d[@]}" 732 status_print "I/OAT" "${!ioat_d[@]}" 733 status_print "DSA" "${!dsa_d[@]}" 734 status_print "IAA" "${!iaa_d[@]}" 735 status_print "VMD" "${!vmd_d[@]}" 736} 737 738function configure_freebsd_pci() { 739 local BDFS 740 741 BDFS+=("${!nvme_d[@]}") 742 BDFS+=("${!ioat_d[@]}") 743 BDFS+=("${!dsa_d[@]}") 744 BDFS+=("${!iaa_d[@]}") 745 BDFS+=("${!vmd_d[@]}") 746 747 # Drop the domain part from all the addresses 748 BDFS=("${BDFS[@]#*:}") 749 750 local IFS="," 751 kldunload nic_uio.ko || true 752 kenv hw.nic_uio.bdfs="${BDFS[*]}" 753 kldload nic_uio.ko 754} 755 756function configure_freebsd() { 757 if ! check_for_driver_freebsd; then 758 echo "DPDK drivers (contigmem and/or nic_uio) are missing, aborting" >&2 759 return 1 760 fi 761 configure_freebsd_pci 762 # If contigmem is already loaded but the HUGEMEM specified doesn't match the 763 # previous value, unload contigmem so that we can reload with the new value. 764 if kldstat -q -m contigmem; then 765 # contigmem may be loaded, but the kernel environment doesn't have to 766 # be necessarily set at this point. If it isn't, kenv will fail to 767 # pick up the hw. options. Handle it. 768 if ! contigmem_num_buffers=$(kenv hw.contigmem.num_buffers); then 769 contigmem_num_buffers=-1 770 fi 2> /dev/null 771 if ((contigmem_num_buffers != HUGEMEM / 256)); then 772 kldunload contigmem.ko 773 fi 774 fi 775 if ! kldstat -q -m contigmem; then 776 kenv hw.contigmem.num_buffers=$((HUGEMEM / 256)) 777 kenv hw.contigmem.buffer_size=$((256 * 1024 * 1024)) 778 kldload contigmem.ko 779 fi 780} 781 782function reset_freebsd() { 783 kldunload contigmem.ko || true 784 kldunload nic_uio.ko || true 785} 786 787CMD=reset cache_pci_bus 788 789mode=$1 790 791if [ -z "$mode" ]; then 792 mode="config" 793fi 794 795: ${HUGEMEM:=2048} 796: ${PCI_ALLOWED:=""} 797: ${PCI_BLOCKED:=""} 798 799if [ -n "$NVME_ALLOWED" ]; then 800 PCI_ALLOWED="$PCI_ALLOWED $NVME_ALLOWED" 801fi 802 803if [ -n "$SKIP_PCI" ]; then 804 PCI_ALLOWED="none" 805fi 806 807if [ -z "$TARGET_USER" ]; then 808 TARGET_USER="$SUDO_USER" 809 if [ -z "$TARGET_USER" ]; then 810 TARGET_USER=$(logname 2> /dev/null) || true 811 fi 812fi 813 814collect_devices "$mode" 815 816if [[ $mode == reset && $PCI_BLOCK_SYNC_ON_RESET == yes ]]; then 817 # Note that this will wait only for the first block device attached to 818 # a given storage controller. For nvme this may miss some of the devs 819 # in case multiple namespaces are being in place. 820 # FIXME: Wait for nvme controller(s) to be in live state and determine 821 # number of configured namespaces, build list of potential block devs 822 # and pass them to sync_dev_uevents. Is it worth the effort? 823 bdfs_to_wait_for=() 824 for bdf in "${!all_devices_d[@]}"; do 825 ((all_devices_d["$bdf"] == 0)) || continue 826 if [[ -n ${nvme_d["$bdf"]} || -n ${virtio_d["$bdf"]} ]]; then 827 [[ $(collect_driver "$bdf") != "${drivers_d["$bdf"]}" ]] || continue 828 bdfs_to_wait_for+=("$bdf") 829 fi 830 done 831 if ((${#bdfs_to_wait_for[@]} > 0)); then 832 echo "Waiting for block devices as requested" 833 export UEVENT_TIMEOUT=5 DEVPATH_LOOKUP=yes DEVPATH_SUBSYSTEM=pci 834 "$rootdir/scripts/sync_dev_uevents.sh" \ 835 block/disk \ 836 "${bdfs_to_wait_for[@]}" & 837 sync_pid=$! 838 fi 839fi 840 841if [[ $os == Linux ]]; then 842 if [[ -n $HUGEPGSZ && ! -e /sys/kernel/mm/hugepages/hugepages-${HUGEPGSZ}kB ]]; then 843 echo "${HUGEPGSZ}kB is not supported by the running kernel, ignoring" >&2 844 unset -v HUGEPGSZ 845 fi 846 847 HUGEPGSZ=${HUGEPGSZ:-$(grep Hugepagesize /proc/meminfo | cut -d : -f 2 | tr -dc '0-9')} 848 HUGEPGSZ_MB=$((HUGEPGSZ / 1024)) 849 : ${NRHUGE=$(((HUGEMEM + HUGEPGSZ_MB - 1) / HUGEPGSZ_MB))} 850 851 if [ "$mode" == "config" ]; then 852 configure_linux 853 elif [ "$mode" == "cleanup" ]; then 854 cleanup_linux 855 clear_hugepages 856 elif [ "$mode" == "reset" ]; then 857 reset_linux 858 elif [ "$mode" == "status" ]; then 859 status_linux 860 elif [ "$mode" == "help" ]; then 861 usage $0 862 else 863 usage $0 "Invalid argument '$mode'" 864 fi 865else 866 if [ "$mode" == "config" ]; then 867 configure_freebsd 868 elif [ "$mode" == "reset" ]; then 869 reset_freebsd 870 elif [ "$mode" == "cleanup" ]; then 871 echo "setup.sh cleanup function not yet supported on $os" 872 elif [ "$mode" == "status" ]; then 873 status_freebsd 874 elif [ "$mode" == "help" ]; then 875 usage $0 876 else 877 usage $0 "Invalid argument '$mode'" 878 fi 879fi 880 881if [[ -e /proc/$sync_pid/status ]]; then 882 wait "$sync_pid" 883fi 884