1#!/usr/bin/env bash 2# SPDX-License-Identifier: BSD-3-Clause 3# Copyright (C) 2016 Intel Corporation 4# All rights reserved. 5# 6set -e 7shopt -s nullglob extglob 8 9os=$(uname -s) 10 11if [[ $os != Linux && $os != FreeBSD ]]; then 12 echo "Not supported platform ($os), aborting" 13 exit 1 14fi 15 16rootdir=$(readlink -f $(dirname $0))/.. 17source "$rootdir/scripts/common.sh" 18 19function usage() { 20 if [[ $os == Linux ]]; then 21 options="[config|reset|status|cleanup|help]" 22 else 23 options="[config|reset|help]" 24 fi 25 26 [[ -n $2 ]] && ( 27 echo "$2" 28 echo "" 29 ) 30 echo "Helper script for allocating hugepages and binding NVMe, I/OAT, VMD and Virtio devices" 31 echo "to a generic VFIO kernel driver. If VFIO is not available on the system, this script" 32 echo "will fall back to UIO. NVMe and Virtio devices with active mountpoints will be ignored." 33 echo "All hugepage operations use default hugepage size on the system (hugepagesz)." 34 echo "Usage: $(basename $1) $options" 35 echo 36 echo "$options - as following:" 37 echo "config Default mode. Allocate hugepages and bind PCI devices." 38 if [[ $os == Linux ]]; then 39 echo "cleanup Remove any orphaned files that can be left in the system after SPDK application exit" 40 fi 41 echo "reset Rebind PCI devices back to their original drivers." 42 echo " Also cleanup any leftover spdk files/resources." 43 echo " Hugepage memory size will remain unchanged." 44 if [[ $os == Linux ]]; then 45 echo "status Print status of all SPDK-compatible devices on the system." 46 fi 47 echo "help Print this help message." 48 echo 49 echo "The following environment variables can be specified." 50 echo "HUGEMEM Size of hugepage memory to allocate (in MB). 2048 by default." 51 echo " For NUMA systems, the hugepages will be distributed on node0 by" 52 echo " default." 53 echo "HUGE_EVEN_ALLOC If set to 'yes', hugepages will be evenly distributed across all" 54 echo " system's NUMA nodes (effectively ignoring anything set in HUGENODE)." 55 echo " Uses kernel's default for hugepages size." 56 echo "NRHUGE Number of hugepages to allocate. This variable overwrites HUGEMEM." 57 echo "HUGENODE Specific NUMA node to allocate hugepages on. Multiple nodes can be" 58 echo " separated with comas. By default, NRHUGE will be applied on each node." 59 echo " Hugepages can be defined per node with e.g.:" 60 echo " HUGENODE='nodes_hp[0]=2048,nodes_hp[1]=512,2' - this will allocate" 61 echo " 2048 pages for node0, 512 for node1 and default NRHUGE for node2." 62 echo "HUGEPGSZ Size of the hugepages to use in kB. If not set, kernel's default" 63 echo " setting is used." 64 echo "SHRINK_HUGE If set to 'yes', hugepages allocation won't be skipped in case" 65 echo " number of requested hugepages is lower from what's already" 66 echo " allocated. Doesn't apply when HUGE_EVEN_ALLOC is in use." 67 echo "CLEAR_HUGE If set to 'yes', the attempt to remove hugepages from all nodes will" 68 echo " be made prior to allocation". 69 echo "PCI_ALLOWED" 70 echo "PCI_BLOCKED Whitespace separated list of PCI devices (NVMe, I/OAT, VMD, Virtio)." 71 echo " Each device must be specified as a full PCI address." 72 echo " E.g. PCI_ALLOWED=\"0000:01:00.0 0000:02:00.0\"" 73 echo " To block all PCI devices: PCI_ALLOWED=\"none\"" 74 echo " To allow all PCI devices except 0000:01:00.0: PCI_BLOCKED=\"0000:01:00.0\"" 75 echo " To allow only PCI device 0000:01:00.0: PCI_ALLOWED=\"0000:01:00.0\"" 76 echo " If PCI_ALLOWED and PCI_BLOCKED are empty or unset, all PCI devices" 77 echo " will be bound." 78 echo " Each device in PCI_BLOCKED will be ignored (driver won't be changed)." 79 echo " PCI_BLOCKED has precedence over PCI_ALLOWED." 80 echo "TARGET_USER User that will own hugepage mountpoint directory and vfio groups." 81 echo " By default the current user will be used." 82 echo "DRIVER_OVERRIDE Disable automatic vfio-pci/uio_pci_generic selection and forcefully" 83 echo " bind devices to the given driver." 84 echo " E.g. DRIVER_OVERRIDE=uio_pci_generic or DRIVER_OVERRIDE=/home/public/dpdk/build/kmod/igb_uio.ko" 85 echo "PCI_BLOCK_SYNC_ON_RESET" 86 echo " If set in the environment, the attempt to wait for block devices associated" 87 echo " with given PCI device will be made upon reset" 88 exit 0 89} 90 91# In monolithic kernels the lsmod won't work. So 92# back that with a /sys/modules. We also check 93# /sys/bus/pci/drivers/ as neither lsmod nor /sys/modules might 94# contain needed info (like in Fedora-like OS). 95function check_for_driver() { 96 if [[ -z $1 ]]; then 97 return 0 98 fi 99 100 if lsmod | grep -q ${1//-/_}; then 101 return 1 102 fi 103 104 if [[ -d /sys/module/${1} || -d \ 105 /sys/module/${1//-/_} || -d \ 106 /sys/bus/pci/drivers/${1} || -d \ 107 /sys/bus/pci/drivers/${1//-/_} ]]; then 108 return 2 109 fi 110 return 0 111} 112 113function check_for_driver_freebsd() { 114 # Check if dpdk drivers (nic_uio, contigmem) are in the kernel's module path. 115 local search_paths path driver 116 IFS=";" read -ra search_paths < <(kldconfig -rU) 117 118 for driver in contigmem.ko nic_uio.ko; do 119 for path in "${search_paths[@]}"; do 120 [[ -f $path/$driver ]] && continue 2 121 done 122 return 1 123 done 124 return 0 125} 126 127function pci_dev_echo() { 128 local bdf="$1" 129 shift 130 echo "$bdf (${pci_ids_vendor["$bdf"]#0x} ${pci_ids_device["$bdf"]#0x}): $*" 131} 132 133function linux_bind_driver() { 134 bdf="$1" 135 driver_name="$2" 136 old_driver_name=${drivers_d["$bdf"]:-no driver} 137 ven_dev_id="${pci_ids_vendor["$bdf"]#0x} ${pci_ids_device["$bdf"]#0x}" 138 139 if [[ $driver_name == "$old_driver_name" ]]; then 140 pci_dev_echo "$bdf" "Already using the $old_driver_name driver" 141 return 0 142 fi 143 144 if [[ $old_driver_name != "no driver" ]]; then 145 echo "$ven_dev_id" > "/sys/bus/pci/devices/$bdf/driver/remove_id" 2> /dev/null || true 146 echo "$bdf" > "/sys/bus/pci/devices/$bdf/driver/unbind" 147 fi 148 149 pci_dev_echo "$bdf" "$old_driver_name -> $driver_name" 150 151 if [[ $driver_name == "none" ]]; then 152 return 0 153 fi 154 155 echo "$ven_dev_id" > "/sys/bus/pci/drivers/$driver_name/new_id" 2> /dev/null || true 156 echo "$bdf" > "/sys/bus/pci/drivers/$driver_name/bind" 2> /dev/null || true 157 158 if [[ $driver_name == uio_pci_generic ]] && ! check_for_driver igb_uio; then 159 # Check if the uio_pci_generic driver is broken as it might be in 160 # some 4.18.x kernels (see centos8 for instance) - if our device 161 # didn't get a proper uio entry, fallback to igb_uio 162 if [[ ! -e /sys/bus/pci/devices/$bdf/uio ]]; then 163 pci_dev_echo "$bdf" "uio_pci_generic potentially broken, moving to igb_uio" 164 drivers_d["$bdf"]="no driver" 165 # This call will override $driver_name for remaining devices as well 166 linux_bind_driver "$bdf" igb_uio 167 fi 168 fi 169 170 iommu_group=$(basename $(readlink -f /sys/bus/pci/devices/$bdf/iommu_group)) 171 if [ -e "/dev/vfio/$iommu_group" ]; then 172 if [ -n "$TARGET_USER" ]; then 173 chown "$TARGET_USER" "/dev/vfio/$iommu_group" 174 fi 175 fi 176} 177 178function linux_unbind_driver() { 179 local bdf="$1" 180 local ven_dev_id 181 ven_dev_id="${pci_ids_vendor["$bdf"]#0x} ${pci_ids_device["$bdf"]#0x}" 182 local old_driver_name=${drivers_d["$bdf"]:-no driver} 183 184 if [[ $old_driver_name == "no driver" ]]; then 185 pci_dev_echo "$bdf" "Not bound to any driver" 186 return 0 187 fi 188 189 if [[ -e /sys/bus/pci/drivers/$old_driver_name ]]; then 190 echo "$ven_dev_id" > "/sys/bus/pci/drivers/$old_driver_name/remove_id" 2> /dev/null || true 191 echo "$bdf" > "/sys/bus/pci/drivers/$old_driver_name/unbind" 192 fi 193 194 pci_dev_echo "$bdf" "$old_driver_name -> no driver" 195} 196 197function linux_hugetlbfs_mounts() { 198 mount | grep ' type hugetlbfs ' | awk '{ print $3 }' 199} 200 201function get_block_dev_from_bdf() { 202 local bdf=$1 203 local block 204 205 for block in /sys/block/*; do 206 if [[ $(readlink -f "$block/device") == *"/$bdf/"* ]]; then 207 echo "${block##*/}" 208 fi 209 done 210} 211 212function get_used_bdf_block_devs() { 213 local bdf=$1 214 local blocks block blockp dev mount holder 215 local used 216 217 hash lsblk &> /dev/null || return 1 218 blocks=($(get_block_dev_from_bdf "$bdf")) 219 220 for block in "${blocks[@]}"; do 221 # Check if the device is hold by some other, regardless if it's mounted 222 # or not. 223 for holder in "/sys/class/block/$block"*/holders/*; do 224 [[ -e $holder ]] || continue 225 blockp=${holder%/holders*} blockp=${blockp##*/} 226 if [[ -e $holder/slaves/$blockp ]]; then 227 used+=("holder@$blockp:${holder##*/}") 228 fi 229 done 230 while read -r dev mount; do 231 if [[ -e $mount ]]; then 232 used+=("mount@$block:$dev") 233 fi 234 done < <(lsblk -l -n -o NAME,MOUNTPOINT "/dev/$block") 235 if ((${#used[@]} == 0)); then 236 # Make sure we check if there's any valid data present on the target device 237 # regardless if it's being actively used or not. This is mainly done to make 238 # sure we don't miss more complex setups like ZFS pools, etc. 239 if block_in_use "$block" > /dev/null; then 240 used+=("data@$block") 241 fi 242 fi 243 done 244 245 if ((${#used[@]} > 0)); then 246 printf '%s\n' "${used[@]}" 247 fi 248} 249 250function collect_devices() { 251 # NVMe, IOAT, DSA, IAA, VIRTIO, VMD 252 253 local ids dev_type dev_id bdf bdfs in_use driver 254 255 ids+="PCI_DEVICE_ID_INTEL_IOAT" 256 ids+="|PCI_DEVICE_ID_INTEL_DSA" 257 ids+="|PCI_DEVICE_ID_INTEL_IAA" 258 ids+="|PCI_DEVICE_ID_VIRTIO" 259 ids+="|PCI_DEVICE_ID_INTEL_VMD" 260 ids+="|SPDK_PCI_CLASS_NVME" 261 262 local -gA nvme_d ioat_d dsa_d iaa_d virtio_d vmd_d all_devices_d drivers_d 263 264 while read -r _ dev_type dev_id; do 265 bdfs=(${pci_bus_cache["0x8086:$dev_id"]}) 266 [[ $dev_type == *NVME* ]] && bdfs=(${pci_bus_cache["$dev_id"]}) 267 [[ $dev_type == *VIRT* ]] && bdfs=(${pci_bus_cache["0x1af4:$dev_id"]}) 268 [[ $dev_type =~ (NVME|IOAT|DSA|IAA|VIRTIO|VMD) ]] && dev_type=${BASH_REMATCH[1],,} 269 for bdf in "${bdfs[@]}"; do 270 in_use=0 271 if [[ $1 != status ]]; then 272 if ! pci_can_use "$bdf"; then 273 pci_dev_echo "$bdf" "Skipping denied controller at $bdf" 274 in_use=1 275 fi 276 if [[ $dev_type == nvme || $dev_type == virtio ]]; then 277 if ! verify_bdf_block_devs "$bdf"; then 278 in_use=1 279 fi 280 fi 281 if [[ $dev_type == vmd ]]; then 282 if [[ $PCI_ALLOWED != *"$bdf"* ]]; then 283 pci_dev_echo "$bdf" "Skipping not allowed VMD controller at $bdf" 284 in_use=1 285 elif [[ " ${drivers_d[*]} " =~ "nvme" ]]; then 286 if [[ "${DRIVER_OVERRIDE}" != "none" ]]; then 287 if [ "$mode" == "config" ]; then 288 cat <<- MESSAGE 289 Binding new driver to VMD device. If there are NVMe SSDs behind the VMD endpoint 290 which are attached to the kernel NVMe driver,the binding process may go faster 291 if you first run this script with DRIVER_OVERRIDE="none" to unbind only the 292 NVMe SSDs, and then run again to unbind the VMD devices." 293 MESSAGE 294 fi 295 fi 296 fi 297 fi 298 fi 299 eval "${dev_type}_d[$bdf]=$in_use" 300 all_devices_d["$bdf"]=$in_use 301 if [[ -e /sys/bus/pci/devices/$bdf/driver ]]; then 302 driver=$(readlink -f "/sys/bus/pci/devices/$bdf/driver") 303 drivers_d["$bdf"]=${driver##*/} 304 fi 305 done 306 done < <(grep -E "$ids" "$rootdir/include/spdk/pci_ids.h") 307} 308 309function collect_driver() { 310 local bdf=$1 311 local drivers driver 312 313 if [[ -e /sys/bus/pci/devices/$bdf/modalias ]] \ 314 && drivers=($(modprobe -R "$(< "/sys/bus/pci/devices/$bdf/modalias")")); then 315 # Pick first entry in case multiple aliases are bound to a driver. 316 driver=$(readlink -f "/sys/module/${drivers[0]}/drivers/pci:"*) 317 driver=${driver##*/} 318 else 319 [[ -n ${nvme_d["$bdf"]} ]] && driver=nvme 320 [[ -n ${ioat_d["$bdf"]} ]] && driver=ioatdma 321 [[ -n ${dsa_d["$bdf"]} ]] && driver=dsa 322 [[ -n ${iaa_d["$bdf"]} ]] && driver=iaa 323 [[ -n ${virtio_d["$bdf"]} ]] && driver=virtio-pci 324 [[ -n ${vmd_d["$bdf"]} ]] && driver=vmd 325 fi 2> /dev/null 326 echo "$driver" 327} 328 329function verify_bdf_block_devs() { 330 local bdf=$1 331 local blknames 332 blknames=($(get_used_bdf_block_devs "$bdf")) || return 1 333 334 if ((${#blknames[@]} > 0)); then 335 local IFS="," 336 pci_dev_echo "$bdf" "Active devices: ${blknames[*]}, so not binding PCI dev" 337 return 1 338 fi 339} 340 341function configure_linux_pci() { 342 local driver_path="" 343 driver_name="" 344 igb_uio_fallback="" 345 346 if [[ -r "$rootdir/dpdk/build-tmp/kernel/linux/igb_uio/igb_uio.ko" ]]; then 347 # igb_uio is a common driver to override with and it depends on uio. 348 modprobe uio || true 349 if ! check_for_driver igb_uio || insmod "$rootdir/dpdk/build-tmp/kernel/linux/igb_uio/igb_uio.ko"; then 350 igb_uio_fallback="$rootdir/dpdk/build-tmp/kernel/linux/igb_uio/igb_uio.ko" 351 fi 352 fi 353 354 if [[ "${DRIVER_OVERRIDE}" == "none" ]]; then 355 driver_name=none 356 elif [[ -n "${DRIVER_OVERRIDE}" ]]; then 357 driver_path="$DRIVER_OVERRIDE" 358 driver_name="${DRIVER_OVERRIDE##*/}" 359 # modprobe and the sysfs don't use the .ko suffix. 360 driver_name=${driver_name%.ko} 361 # path = name -> there is no path 362 if [[ "$driver_path" = "$driver_name" ]]; then 363 driver_path="" 364 fi 365 elif [[ -n "$(ls /sys/kernel/iommu_groups)" || (-e \ 366 /sys/module/vfio/parameters/enable_unsafe_noiommu_mode && \ 367 "$(cat /sys/module/vfio/parameters/enable_unsafe_noiommu_mode)" == "Y") ]]; then 368 driver_name=vfio-pci 369 # Just in case, attempt to load VFIO_IOMMU_TYPE1 module into the kernel - this 370 # should be done automatically by modprobe since this particular module should 371 # be a part of vfio-pci dependencies, however, on some distros, it seems that 372 # it's not the case. See #1689. 373 if modinfo vfio_iommu_type1 > /dev/null; then 374 modprobe vfio_iommu_type1 375 fi 376 elif ! check_for_driver uio_pci_generic || modinfo uio_pci_generic > /dev/null 2>&1; then 377 driver_name=uio_pci_generic 378 elif [[ -e $igb_uio_fallback ]]; then 379 driver_path="$igb_uio_fallback" 380 driver_name="igb_uio" 381 echo "WARNING: uio_pci_generic not detected - using $driver_name" 382 else 383 echo "No valid drivers found [vfio-pci, uio_pci_generic, igb_uio]. Please enable one of the kernel modules." 384 return 1 385 fi 386 387 # modprobe assumes the directory of the module. If the user passes in a path, we should use insmod 388 if [[ $driver_name != "none" ]]; then 389 if [[ -n "$driver_path" ]]; then 390 insmod $driver_path || true 391 else 392 modprobe $driver_name 393 fi 394 fi 395 396 for bdf in "${!all_devices_d[@]}"; do 397 if ((all_devices_d["$bdf"] == 0)); then 398 if [[ -n ${nvme_d["$bdf"]} ]]; then 399 # Some nvme controllers may take significant amount of time while being 400 # unbound from the driver. Put that task into background to speed up the 401 # whole process. Currently this is done only for the devices bound to the 402 # nvme driver as other, i.e., ioatdma's, trigger a kernel BUG when being 403 # unbound in parallel. See https://bugzilla.kernel.org/show_bug.cgi?id=209041. 404 linux_bind_driver "$bdf" "$driver_name" & 405 else 406 linux_bind_driver "$bdf" "$driver_name" 407 fi 408 fi 409 done 410 wait 411 412 echo "1" > "/sys/bus/pci/rescan" 413} 414 415function cleanup_linux() { 416 local dirs_to_clean=() files_to_clean=() opened_files=() file_locks=() 417 local match_spdk="spdk_tgt|iscsi|vhost|nvmf|rocksdb|bdevio|bdevperf|vhost_fuzz|nvme_fuzz|accel_perf|bdev_svc" 418 419 dirs_to_clean=({/var/run,/tmp}/dpdk/spdk{,_pid}+([0-9])) 420 if [[ -d $XDG_RUNTIME_DIR ]]; then 421 dirs_to_clean+=("$XDG_RUNTIME_DIR/dpdk/spdk"{,_pid}+([0-9])) 422 fi 423 424 for dir in "${dirs_to_clean[@]}"; do 425 files_to_clean+=("$dir/"*) 426 done 427 file_locks+=(/var/tmp/spdk_pci_lock*) 428 file_locks+=(/var/tmp/spdk_cpu_lock*) 429 430 files_to_clean+=(/dev/shm/@(@($match_spdk)_trace|spdk_iscsi_conns)*) 431 files_to_clean+=("${file_locks[@]}") 432 433 # This may fail in case path that readlink attempts to resolve suddenly 434 # disappears (as it may happen with terminating processes). 435 opened_files+=($(readlink -f /proc/+([0-9])/fd/+([0-9]))) || true 436 437 if ((${#opened_files[@]} == 0)); then 438 echo "Can't get list of opened files!" 439 exit 1 440 fi 441 442 echo 'Cleaning' 443 for f in "${files_to_clean[@]}"; do 444 [[ -e $f ]] || continue 445 if [[ ${opened_files[*]} != *"$f"* ]]; then 446 echo "Removing: $f" 447 rm $f 448 else 449 echo "Still open: $f" 450 fi 451 done 452 453 for dir in "${dirs_to_clean[@]}"; do 454 [[ -d $dir ]] || continue 455 if [[ ${opened_files[*]} != *"$dir"* ]]; then 456 echo "Removing: $dir" 457 rmdir $dir 458 else 459 echo "Still open: $dir" 460 fi 461 done 462 echo "Clean" 463} 464 465check_hugepages_alloc() { 466 local hp_int=$1 467 local allocated_hugepages 468 469 allocated_hugepages=$(< "$hp_int") 470 471 if ((NRHUGE <= allocated_hugepages)) && [[ $SHRINK_HUGE != yes ]]; then 472 echo "INFO: Requested $NRHUGE hugepages but $allocated_hugepages already allocated ${2:+on node$2}" 473 return 0 474 fi 475 476 echo $((NRHUGE < 0 ? 0 : NRHUGE)) > "$hp_int" 477 478 allocated_hugepages=$(< "$hp_int") 479 if ((allocated_hugepages < NRHUGE)); then 480 cat <<- ERROR 481 482 ## ERROR: requested $NRHUGE hugepages but $allocated_hugepages could be allocated ${2:+on node$2}. 483 ## Memory might be heavily fragmented. Please try flushing the system cache, or reboot the machine. 484 ERROR 485 return 1 486 fi 487} 488 489clear_hugepages() { echo 0 > /proc/sys/vm/nr_hugepages; } 490 491configure_linux_hugepages() { 492 local node system_nodes 493 local nodes_to_use nodes_hp 494 495 if [[ $CLEAR_HUGE == yes ]]; then 496 clear_hugepages 497 fi 498 499 if [[ $HUGE_EVEN_ALLOC == yes ]]; then 500 clear_hugepages 501 check_hugepages_alloc /proc/sys/vm/nr_hugepages 502 return 0 503 fi 504 505 for node in /sys/devices/system/node/node*; do 506 [[ -e $node ]] || continue 507 nodes[${node##*node}]=$node/hugepages/hugepages-${HUGEPGSZ}kB/nr_hugepages 508 done 509 510 if ((${#nodes[@]} == 0)); then 511 # No NUMA support? Fallback to common interface 512 check_hugepages_alloc /proc/sys/vm/nr_hugepages 513 return 0 514 fi 515 516 IFS="," read -ra nodes_to_use <<< "$HUGENODE" 517 if ((${#nodes_to_use[@]} == 0)); then 518 nodes_to_use[0]=0 519 fi 520 521 # Align indexes with node ids 522 for node in "${!nodes_to_use[@]}"; do 523 if [[ ${nodes_to_use[node]} =~ ^nodes_hp\[[0-9]+\]= ]]; then 524 eval "${nodes_to_use[node]}" 525 elif [[ ${nodes_to_use[node]} =~ ^[0-9]+$ ]]; then 526 nodes_hp[nodes_to_use[node]]=$NRHUGE 527 fi 528 done 529 530 for node in "${!nodes_hp[@]}"; do 531 if [[ -z ${nodes[node]} ]]; then 532 echo "Node $node doesn't exist, ignoring" >&2 533 continue 534 fi 535 NRHUGE=${nodes_hp[node]:-$NRHUGE} check_hugepages_alloc "${nodes[node]}" "$node" 536 done 537} 538 539function configure_linux() { 540 configure_linux_pci 541 hugetlbfs_mounts=$(linux_hugetlbfs_mounts) 542 543 if [ -z "$hugetlbfs_mounts" ]; then 544 hugetlbfs_mounts=/mnt/huge 545 echo "Mounting hugetlbfs at $hugetlbfs_mounts" 546 mkdir -p "$hugetlbfs_mounts" 547 mount -t hugetlbfs nodev "$hugetlbfs_mounts" 548 fi 549 550 configure_linux_hugepages 551 552 if [ "$driver_name" = "vfio-pci" ]; then 553 if [ -n "$TARGET_USER" ]; then 554 for mount in $hugetlbfs_mounts; do 555 chown "$TARGET_USER" "$mount" 556 chmod g+w "$mount" 557 done 558 559 MEMLOCK_AMNT=$(su "$TARGET_USER" -c "ulimit -l") 560 if [[ $MEMLOCK_AMNT != "unlimited" ]]; then 561 MEMLOCK_MB=$((MEMLOCK_AMNT / 1024)) 562 cat <<- MEMLOCK 563 "$TARGET_USER" user memlock limit: $MEMLOCK_MB MB 564 565 This is the maximum amount of memory you will be 566 able to use with DPDK and VFIO if run as user "$TARGET_USER". 567 To change this, please adjust limits.conf memlock limit for user "$TARGET_USER". 568 MEMLOCK 569 if ((MEMLOCK_AMNT < 65536)); then 570 echo "" 571 echo "## WARNING: memlock limit is less than 64MB" 572 echo -n "## DPDK with VFIO may not be able to initialize " 573 echo "if run as user \"$TARGET_USER\"." 574 fi 575 fi 576 fi 577 fi 578 579 if [ $(uname -i) == "x86_64" ] && [ ! -e /dev/cpu/0/msr ]; then 580 # Some distros build msr as a module. Make sure it's loaded to ensure 581 # DPDK can easily figure out the TSC rate rather than relying on 100ms 582 # sleeps. 583 modprobe msr &> /dev/null || true 584 fi 585} 586 587function reset_linux_pci() { 588 # virtio 589 # TODO: check if virtio-pci is loaded first and just unbind if it is not loaded 590 # Requires some more investigation - for example, some kernels do not seem to have 591 # virtio-pci but just virtio_scsi instead. Also need to make sure we get the 592 # underscore vs. dash right in the virtio_scsi name. 593 modprobe virtio-pci || true 594 for bdf in "${!all_devices_d[@]}"; do 595 ((all_devices_d["$bdf"] == 0)) || continue 596 597 driver=$(collect_driver "$bdf") 598 if [[ -n $driver ]] && ! check_for_driver "$driver"; then 599 linux_bind_driver "$bdf" "$driver" 600 else 601 linux_unbind_driver "$bdf" 602 fi 603 done 604 605 echo "1" > "/sys/bus/pci/rescan" 606} 607 608function reset_linux() { 609 reset_linux_pci 610 for mount in $(linux_hugetlbfs_mounts); do 611 for hp in "$mount"/spdk*map_*; do 612 flock -n "$hp" true && rm -f "$hp" 613 done 614 done 615 rm -f /run/.spdk* 616} 617 618function status_linux() { 619 echo "Hugepages" >&2 620 printf "%-6s %10s %8s / %6s\n" "node" "hugesize" "free" "total" >&2 621 622 numa_nodes=0 623 for path in /sys/devices/system/node/node*/hugepages/hugepages-*/; do 624 numa_nodes=$((numa_nodes + 1)) 625 free_pages=$(cat $path/free_hugepages) 626 all_pages=$(cat $path/nr_hugepages) 627 628 [[ $path =~ (node[0-9]+)/hugepages/hugepages-([0-9]+kB) ]] 629 630 node=${BASH_REMATCH[1]} 631 huge_size=${BASH_REMATCH[2]} 632 633 printf "%-6s %10s %8s / %6s\n" $node $huge_size $free_pages $all_pages 634 done 635 636 # fall back to system-wide hugepages 637 if [ "$numa_nodes" = "0" ]; then 638 free_pages=$(grep HugePages_Free /proc/meminfo | awk '{ print $2 }') 639 all_pages=$(grep HugePages_Total /proc/meminfo | awk '{ print $2 }') 640 node="-" 641 huge_size="$HUGEPGSZ" 642 643 printf "%-6s %10s %8s / %6s\n" $node $huge_size $free_pages $all_pages 644 fi 645 646 printf '\n%-8s %-15s %-6s %-6s %-7s %-16s %-10s %s\n' \ 647 "Type" "BDF" "Vendor" "Device" "NUMA" "Driver" "Device" "Block devices" >&2 648 649 sorted_bdfs=($(printf '%s\n' "${!all_devices_d[@]}" | sort)) 650 651 for bdf in "${sorted_bdfs[@]}"; do 652 driver=${drivers_d["$bdf"]} 653 if [ "$numa_nodes" = "0" ]; then 654 node="-" 655 else 656 node=$(cat /sys/bus/pci/devices/$bdf/numa_node) 657 if ((node == -1)); then 658 node=unknown 659 fi 660 fi 661 if [ "$driver" = "nvme" ] && [ -d /sys/bus/pci/devices/$bdf/nvme ]; then 662 name=$(ls /sys/bus/pci/devices/$bdf/nvme) 663 else 664 name="-" 665 fi 666 667 if [[ -n ${nvme_d["$bdf"]} || -n ${virtio_d["$bdf"]} ]]; then 668 blknames=($(get_block_dev_from_bdf "$bdf")) 669 else 670 blknames=("-") 671 fi 672 673 desc="" 674 desc=${desc:-${nvme_d["$bdf"]:+NVMe}} 675 desc=${desc:-${ioat_d["$bdf"]:+I/OAT}} 676 desc=${desc:-${dsa_d["$bdf"]:+DSA}} 677 desc=${desc:-${iaa_d["$bdf"]:+IAA}} 678 desc=${desc:-${virtio_d["$bdf"]:+virtio}} 679 desc=${desc:-${vmd_d["$bdf"]:+VMD}} 680 681 printf '%-8s %-15s %-6s %-6s %-7s %-16s %-10s %s\n' \ 682 "$desc" "$bdf" "${pci_ids_vendor["$bdf"]#0x}" "${pci_ids_device["$bdf"]#0x}" \ 683 "$node" "${driver:--}" "${name:-}" "${blknames[*]:--}" 684 done 685} 686 687function status_freebsd() { 688 local pci 689 690 status_print() ( 691 local type=$1 692 local dev driver 693 694 shift 695 696 for pci; do 697 printf '%-8s %-15s %-6s %-6s %-16s\n' \ 698 "$type" \ 699 "$pci" \ 700 "${pci_ids_vendor["$pci"]}" \ 701 "${pci_ids_device["$pci"]}" \ 702 "${pci_bus_driver["$pci"]}" 703 done | sort -k2,2 704 ) 705 706 local contigmem=present 707 local contigmem_buffer_size 708 local contigmem_num_buffers 709 710 if ! kldstat -q -m contigmem; then 711 contigmem="not present" 712 fi 713 if ! contigmem_buffer_size=$(kenv hw.contigmem.buffer_size 2> /dev/null); then 714 contigmem_buffer_size="not set" 715 fi 716 if ! contigmem_num_buffers=$(kenv hw.contigmem.num_buffers 2> /dev/null); then 717 contigmem_num_buffers="not set" 718 fi 719 720 cat <<- BSD_INFO 721 Contigmem ($contigmem) 722 Buffer Size: $contigmem_buffer_size 723 Num Buffers: $contigmem_num_buffers 724 725 BSD_INFO 726 727 printf '\n%-8s %-15s %-6s %-6s %-16s\n' \ 728 "Type" "BDF" "Vendor" "Device" "Driver" >&2 729 730 status_print "NVMe" "${!nvme_d[@]}" 731 status_print "I/OAT" "${!ioat_d[@]}" 732 status_print "DSA" "${!dsa_d[@]}" 733 status_print "IAA" "${!iaa_d[@]}" 734 status_print "VMD" "${!vmd_d[@]}" 735} 736 737function configure_freebsd_pci() { 738 local BDFS 739 740 BDFS+=("${!nvme_d[@]}") 741 BDFS+=("${!ioat_d[@]}") 742 BDFS+=("${!dsa_d[@]}") 743 BDFS+=("${!iaa_d[@]}") 744 BDFS+=("${!vmd_d[@]}") 745 746 # Drop the domain part from all the addresses 747 BDFS=("${BDFS[@]#*:}") 748 749 local IFS="," 750 kldunload nic_uio.ko || true 751 kenv hw.nic_uio.bdfs="${BDFS[*]}" 752 kldload nic_uio.ko 753} 754 755function configure_freebsd() { 756 if ! check_for_driver_freebsd; then 757 echo "DPDK drivers (contigmem and/or nic_uio) are missing, aborting" >&2 758 return 1 759 fi 760 configure_freebsd_pci 761 # If contigmem is already loaded but the HUGEMEM specified doesn't match the 762 # previous value, unload contigmem so that we can reload with the new value. 763 if kldstat -q -m contigmem; then 764 # contigmem may be loaded, but the kernel environment doesn't have to 765 # be necessarily set at this point. If it isn't, kenv will fail to 766 # pick up the hw. options. Handle it. 767 if ! contigmem_num_buffers=$(kenv hw.contigmem.num_buffers); then 768 contigmem_num_buffers=-1 769 fi 2> /dev/null 770 if ((contigmem_num_buffers != HUGEMEM / 256)); then 771 kldunload contigmem.ko 772 fi 773 fi 774 if ! kldstat -q -m contigmem; then 775 kenv hw.contigmem.num_buffers=$((HUGEMEM / 256)) 776 kenv hw.contigmem.buffer_size=$((256 * 1024 * 1024)) 777 kldload contigmem.ko 778 fi 779} 780 781function reset_freebsd() { 782 kldunload contigmem.ko || true 783 kldunload nic_uio.ko || true 784} 785 786CMD=reset cache_pci_bus 787 788mode=$1 789 790if [ -z "$mode" ]; then 791 mode="config" 792fi 793 794: ${HUGEMEM:=2048} 795: ${PCI_ALLOWED:=""} 796: ${PCI_BLOCKED:=""} 797 798if [ -n "$NVME_ALLOWED" ]; then 799 PCI_ALLOWED="$PCI_ALLOWED $NVME_ALLOWED" 800fi 801 802if [ -n "$SKIP_PCI" ]; then 803 PCI_ALLOWED="none" 804fi 805 806if [ -z "$TARGET_USER" ]; then 807 TARGET_USER="$SUDO_USER" 808 if [ -z "$TARGET_USER" ]; then 809 TARGET_USER=$(logname 2> /dev/null) || true 810 fi 811fi 812 813collect_devices "$mode" 814 815if [[ $mode == reset && $PCI_BLOCK_SYNC_ON_RESET == yes ]]; then 816 # Note that this will wait only for the first block device attached to 817 # a given storage controller. For nvme this may miss some of the devs 818 # in case multiple namespaces are being in place. 819 # FIXME: Wait for nvme controller(s) to be in live state and determine 820 # number of configured namespaces, build list of potential block devs 821 # and pass them to sync_dev_uevents. Is it worth the effort? 822 bdfs_to_wait_for=() 823 for bdf in "${!all_devices_d[@]}"; do 824 ((all_devices_d["$bdf"] == 0)) || continue 825 if [[ -n ${nvme_d["$bdf"]} || -n ${virtio_d["$bdf"]} ]]; then 826 [[ $(collect_driver "$bdf") != "${drivers_d["$bdf"]}" ]] || continue 827 bdfs_to_wait_for+=("$bdf") 828 fi 829 done 830 if ((${#bdfs_to_wait_for[@]} > 0)); then 831 echo "Waiting for block devices as requested" 832 export UEVENT_TIMEOUT=5 DEVPATH_LOOKUP=yes DEVPATH_SUBSYSTEM=pci 833 "$rootdir/scripts/sync_dev_uevents.sh" \ 834 block/disk \ 835 "${bdfs_to_wait_for[@]}" & 836 sync_pid=$! 837 fi 838fi 839 840if [[ $os == Linux ]]; then 841 if [[ -n $HUGEPGSZ && ! -e /sys/kernel/mm/hugepages/hugepages-${HUGEPGSZ}kB ]]; then 842 echo "${HUGEPGSZ}kB is not supported by the running kernel, ignoring" >&2 843 unset -v HUGEPGSZ 844 fi 845 846 HUGEPGSZ=${HUGEPGSZ:-$(grep Hugepagesize /proc/meminfo | cut -d : -f 2 | tr -dc '0-9')} 847 HUGEPGSZ_MB=$((HUGEPGSZ / 1024)) 848 : ${NRHUGE=$(((HUGEMEM + HUGEPGSZ_MB - 1) / HUGEPGSZ_MB))} 849 850 if [ "$mode" == "config" ]; then 851 configure_linux 852 elif [ "$mode" == "cleanup" ]; then 853 cleanup_linux 854 clear_hugepages 855 elif [ "$mode" == "reset" ]; then 856 reset_linux 857 elif [ "$mode" == "status" ]; then 858 status_linux 859 elif [ "$mode" == "help" ]; then 860 usage $0 861 else 862 usage $0 "Invalid argument '$mode'" 863 fi 864else 865 if [ "$mode" == "config" ]; then 866 configure_freebsd 867 elif [ "$mode" == "reset" ]; then 868 reset_freebsd 869 elif [ "$mode" == "cleanup" ]; then 870 echo "setup.sh cleanup function not yet supported on $os" 871 elif [ "$mode" == "status" ]; then 872 status_freebsd 873 elif [ "$mode" == "help" ]; then 874 usage $0 875 else 876 usage $0 "Invalid argument '$mode'" 877 fi 878fi 879 880if [[ -e /proc/$sync_pid/status ]]; then 881 wait "$sync_pid" 882fi 883