1#!/usr/bin/env bash 2# SPDX-License-Identifier: BSD-3-Clause 3# Copyright (C) 2016 Intel Corporation 4# All rights reserved. 5# 6set -e 7shopt -s nullglob extglob 8 9os=$(uname -s) 10 11if [[ $os != Linux && $os != FreeBSD ]]; then 12 echo "Not supported platform ($os), aborting" 13 exit 1 14fi 15 16rootdir=$(readlink -f $(dirname $0))/.. 17source "$rootdir/scripts/common.sh" 18 19function usage() { 20 if [[ $os == Linux ]]; then 21 options="[config|reset|status|cleanup|help]" 22 else 23 options="[config|reset|help]" 24 fi 25 26 [[ -n $2 ]] && ( 27 echo "$2" 28 echo "" 29 ) 30 echo "Helper script for allocating hugepages and binding NVMe, I/OAT, VMD and Virtio devices" 31 echo "to a generic VFIO kernel driver. If VFIO is not available on the system, this script" 32 echo "will fall back to UIO. NVMe and Virtio devices with active mountpoints will be ignored." 33 echo "All hugepage operations use default hugepage size on the system (hugepagesz)." 34 echo "Usage: $(basename $1) $options" 35 echo 36 echo "$options - as following:" 37 echo "config Default mode. Allocate hugepages and bind PCI devices." 38 if [[ $os == Linux ]]; then 39 echo "cleanup Remove any orphaned files that can be left in the system after SPDK application exit" 40 fi 41 echo "reset Rebind PCI devices back to their original drivers." 42 echo " Also cleanup any leftover spdk files/resources." 43 echo " Hugepage memory size will remain unchanged." 44 if [[ $os == Linux ]]; then 45 echo "status Print status of all SPDK-compatible devices on the system." 46 fi 47 echo "help Print this help message." 48 echo 49 echo "The following environment variables can be specified." 50 echo "HUGEMEM Size of hugepage memory to allocate (in MB). 2048 by default." 51 echo " For NUMA systems, the hugepages will be distributed on node0 by" 52 echo " default." 
53 echo "HUGE_EVEN_ALLOC If set to 'yes', hugepages will be evenly distributed across all" 54 echo " system's NUMA nodes (effectively ignoring anything set in HUGENODE)." 55 echo " Uses kernel's default for hugepages size." 56 echo "NRHUGE Number of hugepages to allocate. This variable overwrites HUGEMEM." 57 echo "HUGENODE Specific NUMA node to allocate hugepages on. Multiple nodes can be" 58 echo " separated with comas. By default, NRHUGE will be applied on each node." 59 echo " Hugepages can be defined per node with e.g.:" 60 echo " HUGENODE='nodes_hp[0]=2048,nodes_hp[1]=512,2' - this will allocate" 61 echo " 2048 pages for node0, 512 for node1 and default NRHUGE for node2." 62 echo "HUGEPGSZ Size of the hugepages to use in kB. If not set, kernel's default" 63 echo " setting is used." 64 echo "SHRINK_HUGE If set to 'yes', hugepages allocation won't be skipped in case" 65 echo " number of requested hugepages is lower from what's already" 66 echo " allocated. Doesn't apply when HUGE_EVEN_ALLOC is in use." 67 echo "CLEAR_HUGE If set to 'yes', the attempt to remove hugepages from all nodes will" 68 echo " be made prior to allocation". 69 echo "PCI_ALLOWED" 70 echo "PCI_BLOCKED Whitespace separated list of PCI devices (NVMe, I/OAT, VMD, Virtio)." 71 echo " Each device must be specified as a full PCI address." 72 echo " E.g. PCI_ALLOWED=\"0000:01:00.0 0000:02:00.0\"" 73 echo " To block all PCI devices: PCI_ALLOWED=\"none\"" 74 echo " To allow all PCI devices except 0000:01:00.0: PCI_BLOCKED=\"0000:01:00.0\"" 75 echo " To allow only PCI device 0000:01:00.0: PCI_ALLOWED=\"0000:01:00.0\"" 76 echo " If PCI_ALLOWED and PCI_BLOCKED are empty or unset, all PCI devices" 77 echo " will be bound." 78 echo " Each device in PCI_BLOCKED will be ignored (driver won't be changed)." 79 echo " PCI_BLOCKED has precedence over PCI_ALLOWED." 80 echo "TARGET_USER User that will own hugepage mountpoint directory and vfio groups." 81 echo " By default the current user will be used." 
82 echo "DRIVER_OVERRIDE Disable automatic vfio-pci/uio_pci_generic selection and forcefully" 83 echo " bind devices to the given driver." 84 echo " E.g. DRIVER_OVERRIDE=uio_pci_generic or DRIVER_OVERRIDE=/home/public/dpdk/build/kmod/igb_uio.ko" 85 echo "PCI_BLOCK_SYNC_ON_RESET" 86 echo " If set in the environment, the attempt to wait for block devices associated" 87 echo " with given PCI device will be made upon reset" 88 exit 0 89} 90 91# In monolithic kernels the lsmod won't work. So 92# back that with a /sys/modules. We also check 93# /sys/bus/pci/drivers/ as neither lsmod nor /sys/modules might 94# contain needed info (like in Fedora-like OS). 95function check_for_driver() { 96 if [[ -z $1 ]]; then 97 return 0 98 fi 99 100 if lsmod | grep -q ${1//-/_}; then 101 return 1 102 fi 103 104 if [[ -d /sys/module/${1} || -d \ 105 /sys/module/${1//-/_} || -d \ 106 /sys/bus/pci/drivers/${1} || -d \ 107 /sys/bus/pci/drivers/${1//-/_} ]]; then 108 return 2 109 fi 110 return 0 111} 112 113function check_for_driver_freebsd() { 114 # Check if dpdk drivers (nic_uio, contigmem) are in the kernel's module path. 
	local search_paths path driver
	# kldconfig -rU prints the kernel module search path as a ';'-separated list.
	IFS=";" read -ra search_paths < <(kldconfig -rU)

	# Fail (return 1) if either required module is missing from every path.
	for driver in contigmem.ko nic_uio.ko; do
		for path in "${search_paths[@]}"; do
			[[ -f $path/$driver ]] && continue 2
		done
		return 1
	done
	return 0
}

# Print a message prefixed with the device's BDF plus its vendor/device IDs
# (looked up in the pci_ids_* maps populated by common.sh's cache_pci_bus).
function pci_dev_echo() {
	local bdf="$1"
	shift
	echo "$bdf (${pci_ids_vendor["$bdf"]#0x} ${pci_ids_device["$bdf"]#0x}): $*"
}

# Bind device $1 to kernel driver $2 via sysfs. "none" unbinds only.
# NOTE(review): bdf/driver_name/old_driver_name are deliberately not "local" -
# driver_name in particular is read later by configure_linux(); confirm before
# tightening scope.
function linux_bind_driver() {
	bdf="$1"
	driver_name="$2"
	old_driver_name=${drivers_d["$bdf"]:-no driver}

	if [[ $driver_name == "$old_driver_name" ]]; then
		pci_dev_echo "$bdf" "Already using the $old_driver_name driver"
		return 0
	fi

	# Release the device from its current driver first.
	if [[ $old_driver_name != "no driver" ]]; then
		echo "$bdf" > "/sys/bus/pci/devices/$bdf/driver/unbind"
	fi

	pci_dev_echo "$bdf" "$old_driver_name -> $driver_name"

	if [[ $driver_name == "none" ]]; then
		return 0
	fi

	# Use the driver_override + drivers_probe sysfs mechanism; retry a few
	# times as the probe can transiently fail right after unbind.
	local probe_attempts=0
	echo "$driver_name" > "/sys/bus/pci/devices/$bdf/driver_override"
	while ! echo "$bdf" > "/sys/bus/pci/drivers_probe" && ((probe_attempts++ < 10)); do
		pci_dev_echo "$bdf" "failed to bind to $driver_name, retrying ($probe_attempts)"
		sleep 0.5
	done 2> /dev/null

	# Clear the override so future probes aren't pinned to this driver.
	echo "" > "/sys/bus/pci/devices/$bdf/driver_override"

	if [[ ! -e /sys/bus/pci/drivers/$driver_name/$bdf ]]; then
		pci_dev_echo "$bdf" "failed to bind to $driver_name, aborting"
		return 1
	fi

	# Hand the vfio group node over to TARGET_USER so SPDK can run unprivileged.
	iommu_group=$(basename $(readlink -f /sys/bus/pci/devices/$bdf/iommu_group))
	if [ -e "/dev/vfio/$iommu_group" ]; then
		if [ -n "$TARGET_USER" ]; then
			chown "$TARGET_USER" "/dev/vfio/$iommu_group"
		fi
	fi
}

# Unbind device $1 from whatever driver drivers_d recorded for it.
function linux_unbind_driver() {
	local bdf="$1"
	local old_driver_name=${drivers_d["$bdf"]:-no driver}

	if [[ $old_driver_name == "no driver" ]]; then
		pci_dev_echo "$bdf" "Not bound to any driver"
		return 0
	fi

	if [[ -e /sys/bus/pci/drivers/$old_driver_name ]]; then
		echo "$bdf" > "/sys/bus/pci/drivers/$old_driver_name/unbind"
		echo "" > "/sys/bus/pci/devices/$bdf/driver_override"
	fi

	pci_dev_echo "$bdf" "$old_driver_name -> no driver"
}

# List all hugetlbfs mountpoints currently mounted.
function linux_hugetlbfs_mounts() {
	mount | grep ' type hugetlbfs ' | awk '{ print $3 }'
}

# Print the block device names (e.g. nvme0n1) whose sysfs device path goes
# through PCI address $1.
function get_block_dev_from_bdf() {
	local bdf=$1
	local block

	for block in /sys/block/*; do
		if [[ $(readlink -f "$block/device") == *"/$bdf/"* ]]; then
			echo "${block##*/}"
		fi
	done
}

# Print one line per "in use" indicator (holder@, mount@, data@) for every
# block device behind PCI address $1. Empty output means the device is free.
function get_used_bdf_block_devs() {
	local bdf=$1
	local blocks block blockp dev mount holder
	local used

	hash lsblk &> /dev/null || return 1
	blocks=($(get_block_dev_from_bdf "$bdf"))

	for block in "${blocks[@]}"; do
		# Check if the device is hold by some other, regardless if it's mounted
		# or not.
218 for holder in "/sys/class/block/$block"*/holders/*; do 219 [[ -e $holder ]] || continue 220 blockp=${holder%/holders*} blockp=${blockp##*/} 221 if [[ -e $holder/slaves/$blockp ]]; then 222 used+=("holder@$blockp:${holder##*/}") 223 fi 224 done 225 while read -r dev mount; do 226 if [[ -e $mount ]]; then 227 used+=("mount@$block:$dev") 228 fi 229 done < <(lsblk -l -n -o NAME,MOUNTPOINT "/dev/$block") 230 if ((${#used[@]} == 0)); then 231 # Make sure we check if there's any valid data present on the target device 232 # regardless if it's being actively used or not. This is mainly done to make 233 # sure we don't miss more complex setups like ZFS pools, etc. 234 if block_in_use "$block" > /dev/null; then 235 used+=("data@$block") 236 fi 237 fi 238 done 239 240 if ((${#used[@]} > 0)); then 241 printf '%s\n' "${used[@]}" 242 fi 243} 244 245function collect_devices() { 246 # NVMe, IOAT, DSA, IAA, VIRTIO, VMD 247 248 local ids dev_type dev_id bdf bdfs in_use driver 249 250 ids+="PCI_DEVICE_ID_INTEL_IOAT" 251 ids+="|PCI_DEVICE_ID_INTEL_DSA" 252 ids+="|PCI_DEVICE_ID_INTEL_IAA" 253 ids+="|PCI_DEVICE_ID_VIRTIO" 254 ids+="|PCI_DEVICE_ID_INTEL_VMD" 255 ids+="|SPDK_PCI_CLASS_NVME" 256 257 local -gA nvme_d ioat_d dsa_d iaa_d virtio_d vmd_d all_devices_d drivers_d 258 259 while read -r _ dev_type dev_id; do 260 bdfs=(${pci_bus_cache["0x8086:$dev_id"]}) 261 [[ $dev_type == *NVME* ]] && bdfs=(${pci_bus_cache["$dev_id"]}) 262 [[ $dev_type == *VIRT* ]] && bdfs=(${pci_bus_cache["0x1af4:$dev_id"]}) 263 [[ $dev_type =~ (NVME|IOAT|DSA|IAA|VIRTIO|VMD) ]] && dev_type=${BASH_REMATCH[1],,} 264 for bdf in "${bdfs[@]}"; do 265 in_use=0 266 if [[ $1 != status ]]; then 267 if ! pci_can_use "$bdf"; then 268 pci_dev_echo "$bdf" "Skipping denied controller at $bdf" 269 in_use=1 270 fi 271 if [[ $dev_type == nvme || $dev_type == virtio ]]; then 272 if ! 
verify_bdf_block_devs "$bdf"; then 273 in_use=1 274 fi 275 fi 276 if [[ $dev_type == vmd ]]; then 277 if [[ $PCI_ALLOWED != *"$bdf"* ]]; then 278 pci_dev_echo "$bdf" "Skipping not allowed VMD controller at $bdf" 279 in_use=1 280 elif [[ " ${drivers_d[*]} " =~ "nvme" ]]; then 281 if [[ "${DRIVER_OVERRIDE}" != "none" ]]; then 282 if [ "$mode" == "config" ]; then 283 cat <<- MESSAGE 284 Binding new driver to VMD device. If there are NVMe SSDs behind the VMD endpoint 285 which are attached to the kernel NVMe driver,the binding process may go faster 286 if you first run this script with DRIVER_OVERRIDE="none" to unbind only the 287 NVMe SSDs, and then run again to unbind the VMD devices." 288 MESSAGE 289 fi 290 fi 291 fi 292 fi 293 fi 294 eval "${dev_type}_d[$bdf]=$in_use" 295 all_devices_d["$bdf"]=$in_use 296 if [[ -e /sys/bus/pci/devices/$bdf/driver ]]; then 297 driver=$(readlink -f "/sys/bus/pci/devices/$bdf/driver") 298 drivers_d["$bdf"]=${driver##*/} 299 fi 300 done 301 done < <(grep -E "$ids" "$rootdir/include/spdk/pci_ids.h") 302} 303 304function collect_driver() { 305 local bdf=$1 306 local drivers driver 307 308 if [[ -e /sys/bus/pci/devices/$bdf/modalias ]] \ 309 && drivers=($(modprobe -R "$(< "/sys/bus/pci/devices/$bdf/modalias")")); then 310 # Pick first entry in case multiple aliases are bound to a driver. 
311 driver=$(readlink -f "/sys/module/${drivers[0]}/drivers/pci:"*) 312 driver=${driver##*/} 313 else 314 [[ -n ${nvme_d["$bdf"]} ]] && driver=nvme 315 [[ -n ${ioat_d["$bdf"]} ]] && driver=ioatdma 316 [[ -n ${dsa_d["$bdf"]} ]] && driver=dsa 317 [[ -n ${iaa_d["$bdf"]} ]] && driver=iaa 318 [[ -n ${virtio_d["$bdf"]} ]] && driver=virtio-pci 319 [[ -n ${vmd_d["$bdf"]} ]] && driver=vmd 320 fi 2> /dev/null 321 echo "$driver" 322} 323 324function verify_bdf_block_devs() { 325 local bdf=$1 326 local blknames 327 blknames=($(get_used_bdf_block_devs "$bdf")) || return 1 328 329 if ((${#blknames[@]} > 0)); then 330 local IFS="," 331 pci_dev_echo "$bdf" "Active devices: ${blknames[*]}, so not binding PCI dev" 332 return 1 333 fi 334} 335 336function configure_linux_pci() { 337 local driver_path="" 338 driver_name="" 339 igb_uio_fallback="" 340 341 if [[ -r "$rootdir/dpdk/build-tmp/kernel/linux/igb_uio/igb_uio.ko" ]]; then 342 # igb_uio is a common driver to override with and it depends on uio. 343 modprobe uio || true 344 if ! check_for_driver igb_uio || insmod "$rootdir/dpdk/build-tmp/kernel/linux/igb_uio/igb_uio.ko"; then 345 igb_uio_fallback="$rootdir/dpdk/build-tmp/kernel/linux/igb_uio/igb_uio.ko" 346 fi 347 fi 348 349 if [[ "${DRIVER_OVERRIDE}" == "none" ]]; then 350 driver_name=none 351 elif [[ -n "${DRIVER_OVERRIDE}" ]]; then 352 driver_path="$DRIVER_OVERRIDE" 353 driver_name="${DRIVER_OVERRIDE##*/}" 354 # modprobe and the sysfs don't use the .ko suffix. 
355 driver_name=${driver_name%.ko} 356 # path = name -> there is no path 357 if [[ "$driver_path" = "$driver_name" ]]; then 358 driver_path="" 359 fi 360 elif [[ -n "$(ls /sys/kernel/iommu_groups)" || (-e \ 361 /sys/module/vfio/parameters/enable_unsafe_noiommu_mode && \ 362 "$(cat /sys/module/vfio/parameters/enable_unsafe_noiommu_mode)" == "Y") ]]; then 363 driver_name=vfio-pci 364 # Just in case, attempt to load VFIO_IOMMU_TYPE1 module into the kernel - this 365 # should be done automatically by modprobe since this particular module should 366 # be a part of vfio-pci dependencies, however, on some distros, it seems that 367 # it's not the case. See #1689. 368 if modinfo vfio_iommu_type1 > /dev/null; then 369 modprobe vfio_iommu_type1 370 fi 371 elif ! check_for_driver uio_pci_generic || modinfo uio_pci_generic > /dev/null 2>&1; then 372 driver_name=uio_pci_generic 373 elif [[ -e $igb_uio_fallback ]]; then 374 driver_path="$igb_uio_fallback" 375 driver_name="igb_uio" 376 echo "WARNING: uio_pci_generic not detected - using $driver_name" 377 else 378 echo "No valid drivers found [vfio-pci, uio_pci_generic, igb_uio]. Please enable one of the kernel modules." 379 return 1 380 fi 381 382 # modprobe assumes the directory of the module. If the user passes in a path, we should use insmod 383 if [[ $driver_name != "none" ]]; then 384 if [[ -n "$driver_path" ]]; then 385 insmod $driver_path || true 386 else 387 modprobe $driver_name 388 fi 389 fi 390 391 for bdf in "${!all_devices_d[@]}"; do 392 if ((all_devices_d["$bdf"] == 0)); then 393 if [[ -n ${nvme_d["$bdf"]} ]]; then 394 # Some nvme controllers may take significant amount of time while being 395 # unbound from the driver. Put that task into background to speed up the 396 # whole process. Currently this is done only for the devices bound to the 397 # nvme driver as other, i.e., ioatdma's, trigger a kernel BUG when being 398 # unbound in parallel. See https://bugzilla.kernel.org/show_bug.cgi?id=209041. 
399 linux_bind_driver "$bdf" "$driver_name" & 400 else 401 linux_bind_driver "$bdf" "$driver_name" 402 fi 403 fi 404 done 405 wait 406 407 echo "1" > "/sys/bus/pci/rescan" 408} 409 410function cleanup_linux() { 411 local dirs_to_clean=() files_to_clean=() opened_files=() file_locks=() 412 local match_spdk="spdk_tgt|iscsi|vhost|nvmf|rocksdb|bdevio|bdevperf|vhost_fuzz|nvme_fuzz|accel_perf|bdev_svc" 413 414 dirs_to_clean=({/var/run,/tmp}/dpdk/spdk{,_pid}+([0-9])) 415 if [[ -d $XDG_RUNTIME_DIR ]]; then 416 dirs_to_clean+=("$XDG_RUNTIME_DIR/dpdk/spdk"{,_pid}+([0-9])) 417 fi 418 419 for dir in "${dirs_to_clean[@]}"; do 420 files_to_clean+=("$dir/"*) 421 done 422 file_locks+=(/var/tmp/spdk_pci_lock*) 423 file_locks+=(/var/tmp/spdk_cpu_lock*) 424 425 files_to_clean+=(/dev/shm/@(@($match_spdk)_trace|spdk_iscsi_conns)*) 426 files_to_clean+=("${file_locks[@]}") 427 428 # This may fail in case path that readlink attempts to resolve suddenly 429 # disappears (as it may happen with terminating processes). 430 opened_files+=($(readlink -f /proc/+([0-9])/fd/+([0-9]))) || true 431 432 if ((${#opened_files[@]} == 0)); then 433 echo "Can't get list of opened files!" 
434 exit 1 435 fi 436 437 echo 'Cleaning' 438 for f in "${files_to_clean[@]}"; do 439 [[ -e $f ]] || continue 440 if [[ ${opened_files[*]} != *"$f"* ]]; then 441 echo "Removing: $f" 442 rm $f 443 else 444 echo "Still open: $f" 445 fi 446 done 447 448 for dir in "${dirs_to_clean[@]}"; do 449 [[ -d $dir ]] || continue 450 if [[ ${opened_files[*]} != *"$dir"* ]]; then 451 echo "Removing: $dir" 452 rmdir $dir 453 else 454 echo "Still open: $dir" 455 fi 456 done 457 echo "Clean" 458} 459 460check_hugepages_alloc() { 461 local hp_int=$1 462 local allocated_hugepages 463 464 allocated_hugepages=$(< "$hp_int") 465 466 if ((NRHUGE <= allocated_hugepages)) && [[ $SHRINK_HUGE != yes ]]; then 467 echo "INFO: Requested $NRHUGE hugepages but $allocated_hugepages already allocated ${2:+on node$2}" 468 return 0 469 fi 470 471 echo $((NRHUGE < 0 ? 0 : NRHUGE)) > "$hp_int" 472 473 allocated_hugepages=$(< "$hp_int") 474 if ((allocated_hugepages < NRHUGE)); then 475 cat <<- ERROR 476 477 ## ERROR: requested $NRHUGE hugepages but $allocated_hugepages could be allocated ${2:+on node$2}. 478 ## Memory might be heavily fragmented. Please try flushing the system cache, or reboot the machine. 479 ERROR 480 return 1 481 fi 482} 483 484clear_hugepages() { echo 0 > /proc/sys/vm/nr_hugepages; } 485 486configure_linux_hugepages() { 487 local node system_nodes 488 local nodes_to_use nodes_hp 489 490 if [[ $CLEAR_HUGE == yes ]]; then 491 clear_hugepages 492 fi 493 494 if [[ $HUGE_EVEN_ALLOC == yes ]]; then 495 clear_hugepages 496 check_hugepages_alloc /proc/sys/vm/nr_hugepages 497 return 0 498 fi 499 500 for node in /sys/devices/system/node/node*; do 501 [[ -e $node ]] || continue 502 nodes[${node##*node}]=$node/hugepages/hugepages-${HUGEPGSZ}kB/nr_hugepages 503 done 504 505 if ((${#nodes[@]} == 0)); then 506 # No NUMA support? 
Fallback to common interface 507 check_hugepages_alloc /proc/sys/vm/nr_hugepages 508 return 0 509 fi 510 511 IFS="," read -ra nodes_to_use <<< "$HUGENODE" 512 if ((${#nodes_to_use[@]} == 0)); then 513 nodes_to_use[0]=0 514 fi 515 516 # Align indexes with node ids 517 for node in "${!nodes_to_use[@]}"; do 518 if [[ ${nodes_to_use[node]} =~ ^nodes_hp\[[0-9]+\]= ]]; then 519 eval "${nodes_to_use[node]}" 520 elif [[ ${nodes_to_use[node]} =~ ^[0-9]+$ ]]; then 521 nodes_hp[nodes_to_use[node]]=$NRHUGE 522 fi 523 done 524 525 for node in "${!nodes_hp[@]}"; do 526 if [[ -z ${nodes[node]} ]]; then 527 echo "Node $node doesn't exist, ignoring" >&2 528 continue 529 fi 530 NRHUGE=${nodes_hp[node]:-$NRHUGE} check_hugepages_alloc "${nodes[node]}" "$node" 531 done 532} 533 534function configure_linux() { 535 configure_linux_pci 536 hugetlbfs_mounts=$(linux_hugetlbfs_mounts) 537 538 if [ -z "$hugetlbfs_mounts" ]; then 539 hugetlbfs_mounts=/mnt/huge 540 echo "Mounting hugetlbfs at $hugetlbfs_mounts" 541 mkdir -p "$hugetlbfs_mounts" 542 mount -t hugetlbfs nodev "$hugetlbfs_mounts" 543 fi 544 545 configure_linux_hugepages 546 547 if [ "$driver_name" = "vfio-pci" ]; then 548 if [ -n "$TARGET_USER" ]; then 549 for mount in $hugetlbfs_mounts; do 550 chown "$TARGET_USER" "$mount" 551 chmod g+w "$mount" 552 done 553 554 MEMLOCK_AMNT=$(su "$TARGET_USER" -c "ulimit -l") 555 if [[ $MEMLOCK_AMNT != "unlimited" ]]; then 556 MEMLOCK_MB=$((MEMLOCK_AMNT / 1024)) 557 cat <<- MEMLOCK 558 "$TARGET_USER" user memlock limit: $MEMLOCK_MB MB 559 560 This is the maximum amount of memory you will be 561 able to use with DPDK and VFIO if run as user "$TARGET_USER". 562 To change this, please adjust limits.conf memlock limit for user "$TARGET_USER". 563 MEMLOCK 564 if ((MEMLOCK_AMNT < 65536)); then 565 echo "" 566 echo "## WARNING: memlock limit is less than 64MB" 567 echo -n "## DPDK with VFIO may not be able to initialize " 568 echo "if run as user \"$TARGET_USER\"." 
569 fi 570 fi 571 fi 572 fi 573 574 if [ $(uname -i) == "x86_64" ] && [ ! -e /dev/cpu/0/msr ]; then 575 # Some distros build msr as a module. Make sure it's loaded to ensure 576 # DPDK can easily figure out the TSC rate rather than relying on 100ms 577 # sleeps. 578 modprobe msr &> /dev/null || true 579 fi 580} 581 582function reset_linux_pci() { 583 # virtio 584 # TODO: check if virtio-pci is loaded first and just unbind if it is not loaded 585 # Requires some more investigation - for example, some kernels do not seem to have 586 # virtio-pci but just virtio_scsi instead. Also need to make sure we get the 587 # underscore vs. dash right in the virtio_scsi name. 588 modprobe virtio-pci || true 589 for bdf in "${!all_devices_d[@]}"; do 590 ((all_devices_d["$bdf"] == 0)) || continue 591 592 driver=$(collect_driver "$bdf") 593 if [[ -n $driver ]] && ! check_for_driver "$driver"; then 594 linux_bind_driver "$bdf" "$driver" 595 else 596 linux_unbind_driver "$bdf" 597 fi 598 done 599 600 echo "1" > "/sys/bus/pci/rescan" 601} 602 603function reset_linux() { 604 reset_linux_pci 605 for mount in $(linux_hugetlbfs_mounts); do 606 for hp in "$mount"/spdk*map_*; do 607 flock -n "$hp" true && rm -f "$hp" 608 done 609 done 610 rm -f /run/.spdk* 611} 612 613function status_linux() { 614 echo "Hugepages" >&2 615 printf "%-6s %10s %8s / %6s\n" "node" "hugesize" "free" "total" >&2 616 617 numa_nodes=0 618 for path in /sys/devices/system/node/node*/hugepages/hugepages-*/; do 619 numa_nodes=$((numa_nodes + 1)) 620 free_pages=$(cat $path/free_hugepages) 621 all_pages=$(cat $path/nr_hugepages) 622 623 [[ $path =~ (node[0-9]+)/hugepages/hugepages-([0-9]+kB) ]] 624 625 node=${BASH_REMATCH[1]} 626 huge_size=${BASH_REMATCH[2]} 627 628 printf "%-6s %10s %8s / %6s\n" $node $huge_size $free_pages $all_pages 629 done 630 631 # fall back to system-wide hugepages 632 if [ "$numa_nodes" = "0" ]; then 633 free_pages=$(grep HugePages_Free /proc/meminfo | awk '{ print $2 }') 634 all_pages=$(grep 
HugePages_Total /proc/meminfo | awk '{ print $2 }') 635 node="-" 636 huge_size="$HUGEPGSZ" 637 638 printf "%-6s %10s %8s / %6s\n" $node $huge_size $free_pages $all_pages 639 fi 640 641 printf '\n%-8s %-15s %-6s %-6s %-7s %-16s %-10s %s\n' \ 642 "Type" "BDF" "Vendor" "Device" "NUMA" "Driver" "Device" "Block devices" >&2 643 644 sorted_bdfs=($(printf '%s\n' "${!all_devices_d[@]}" | sort)) 645 646 for bdf in "${sorted_bdfs[@]}"; do 647 driver=${drivers_d["$bdf"]} 648 if [ "$numa_nodes" = "0" ]; then 649 node="-" 650 else 651 node=$(cat /sys/bus/pci/devices/$bdf/numa_node) 652 if ((node == -1)); then 653 node=unknown 654 fi 655 fi 656 if [ "$driver" = "nvme" ] && [ -d /sys/bus/pci/devices/$bdf/nvme ]; then 657 name=$(ls /sys/bus/pci/devices/$bdf/nvme) 658 else 659 name="-" 660 fi 661 662 if [[ -n ${nvme_d["$bdf"]} || -n ${virtio_d["$bdf"]} ]]; then 663 blknames=($(get_block_dev_from_bdf "$bdf")) 664 else 665 blknames=("-") 666 fi 667 668 desc="" 669 desc=${desc:-${nvme_d["$bdf"]:+NVMe}} 670 desc=${desc:-${ioat_d["$bdf"]:+I/OAT}} 671 desc=${desc:-${dsa_d["$bdf"]:+DSA}} 672 desc=${desc:-${iaa_d["$bdf"]:+IAA}} 673 desc=${desc:-${virtio_d["$bdf"]:+virtio}} 674 desc=${desc:-${vmd_d["$bdf"]:+VMD}} 675 676 printf '%-8s %-15s %-6s %-6s %-7s %-16s %-10s %s\n' \ 677 "$desc" "$bdf" "${pci_ids_vendor["$bdf"]#0x}" "${pci_ids_device["$bdf"]#0x}" \ 678 "$node" "${driver:--}" "${name:-}" "${blknames[*]:--}" 679 done 680} 681 682function status_freebsd() { 683 local pci 684 685 status_print() ( 686 local type=$1 687 local dev driver 688 689 shift 690 691 for pci; do 692 printf '%-8s %-15s %-6s %-6s %-16s\n' \ 693 "$type" \ 694 "$pci" \ 695 "${pci_ids_vendor["$pci"]}" \ 696 "${pci_ids_device["$pci"]}" \ 697 "${pci_bus_driver["$pci"]}" 698 done | sort -k2,2 699 ) 700 701 local contigmem=present 702 local contigmem_buffer_size 703 local contigmem_num_buffers 704 705 if ! kldstat -q -m contigmem; then 706 contigmem="not present" 707 fi 708 if ! 
contigmem_buffer_size=$(kenv hw.contigmem.buffer_size 2> /dev/null); then 709 contigmem_buffer_size="not set" 710 fi 711 if ! contigmem_num_buffers=$(kenv hw.contigmem.num_buffers 2> /dev/null); then 712 contigmem_num_buffers="not set" 713 fi 714 715 cat <<- BSD_INFO 716 Contigmem ($contigmem) 717 Buffer Size: $contigmem_buffer_size 718 Num Buffers: $contigmem_num_buffers 719 720 BSD_INFO 721 722 printf '\n%-8s %-15s %-6s %-6s %-16s\n' \ 723 "Type" "BDF" "Vendor" "Device" "Driver" >&2 724 725 status_print "NVMe" "${!nvme_d[@]}" 726 status_print "I/OAT" "${!ioat_d[@]}" 727 status_print "DSA" "${!dsa_d[@]}" 728 status_print "IAA" "${!iaa_d[@]}" 729 status_print "VMD" "${!vmd_d[@]}" 730} 731 732function configure_freebsd_pci() { 733 local BDFS 734 735 BDFS+=("${!nvme_d[@]}") 736 BDFS+=("${!ioat_d[@]}") 737 BDFS+=("${!dsa_d[@]}") 738 BDFS+=("${!iaa_d[@]}") 739 BDFS+=("${!vmd_d[@]}") 740 741 # Drop the domain part from all the addresses 742 BDFS=("${BDFS[@]#*:}") 743 744 local IFS="," 745 kldunload nic_uio.ko || true 746 kenv hw.nic_uio.bdfs="${BDFS[*]}" 747 kldload nic_uio.ko 748} 749 750function configure_freebsd() { 751 if ! check_for_driver_freebsd; then 752 echo "DPDK drivers (contigmem and/or nic_uio) are missing, aborting" >&2 753 return 1 754 fi 755 configure_freebsd_pci 756 # If contigmem is already loaded but the HUGEMEM specified doesn't match the 757 # previous value, unload contigmem so that we can reload with the new value. 758 if kldstat -q -m contigmem; then 759 # contigmem may be loaded, but the kernel environment doesn't have to 760 # be necessarily set at this point. If it isn't, kenv will fail to 761 # pick up the hw. options. Handle it. 762 if ! contigmem_num_buffers=$(kenv hw.contigmem.num_buffers); then 763 contigmem_num_buffers=-1 764 fi 2> /dev/null 765 if ((contigmem_num_buffers != HUGEMEM / 256)); then 766 kldunload contigmem.ko 767 fi 768 fi 769 if ! 
kldstat -q -m contigmem; then 770 kenv hw.contigmem.num_buffers=$((HUGEMEM / 256)) 771 kenv hw.contigmem.buffer_size=$((256 * 1024 * 1024)) 772 kldload contigmem.ko 773 fi 774} 775 776function reset_freebsd() { 777 kldunload contigmem.ko || true 778 kldunload nic_uio.ko || true 779} 780 781CMD=reset cache_pci_bus 782 783mode=$1 784 785if [ -z "$mode" ]; then 786 mode="config" 787fi 788 789: ${HUGEMEM:=2048} 790: ${PCI_ALLOWED:=""} 791: ${PCI_BLOCKED:=""} 792 793if [ -n "$NVME_ALLOWED" ]; then 794 PCI_ALLOWED="$PCI_ALLOWED $NVME_ALLOWED" 795fi 796 797if [ -n "$SKIP_PCI" ]; then 798 PCI_ALLOWED="none" 799fi 800 801if [ -z "$TARGET_USER" ]; then 802 TARGET_USER="$SUDO_USER" 803 if [ -z "$TARGET_USER" ]; then 804 TARGET_USER=$(logname 2> /dev/null) || true 805 fi 806fi 807 808collect_devices "$mode" 809 810if [[ $mode == reset && $PCI_BLOCK_SYNC_ON_RESET == yes ]]; then 811 # Note that this will wait only for the first block device attached to 812 # a given storage controller. For nvme this may miss some of the devs 813 # in case multiple namespaces are being in place. 814 # FIXME: Wait for nvme controller(s) to be in live state and determine 815 # number of configured namespaces, build list of potential block devs 816 # and pass them to sync_dev_uevents. Is it worth the effort? 817 bdfs_to_wait_for=() 818 for bdf in "${!all_devices_d[@]}"; do 819 ((all_devices_d["$bdf"] == 0)) || continue 820 if [[ -n ${nvme_d["$bdf"]} || -n ${virtio_d["$bdf"]} ]]; then 821 [[ $(collect_driver "$bdf") != "${drivers_d["$bdf"]}" ]] || continue 822 bdfs_to_wait_for+=("$bdf") 823 fi 824 done 825 if ((${#bdfs_to_wait_for[@]} > 0)); then 826 echo "Waiting for block devices as requested" 827 export UEVENT_TIMEOUT=5 DEVPATH_LOOKUP=yes DEVPATH_SUBSYSTEM=pci 828 "$rootdir/scripts/sync_dev_uevents.sh" \ 829 block/disk \ 830 "${bdfs_to_wait_for[@]}" & 831 sync_pid=$! 832 fi 833fi 834 835if [[ $os == Linux ]]; then 836 if [[ -n $HUGEPGSZ && ! 
-e /sys/kernel/mm/hugepages/hugepages-${HUGEPGSZ}kB ]]; then 837 echo "${HUGEPGSZ}kB is not supported by the running kernel, ignoring" >&2 838 unset -v HUGEPGSZ 839 fi 840 841 HUGEPGSZ=${HUGEPGSZ:-$(grep Hugepagesize /proc/meminfo | cut -d : -f 2 | tr -dc '0-9')} 842 HUGEPGSZ_MB=$((HUGEPGSZ / 1024)) 843 : ${NRHUGE=$(((HUGEMEM + HUGEPGSZ_MB - 1) / HUGEPGSZ_MB))} 844 845 if [ "$mode" == "config" ]; then 846 configure_linux 847 elif [ "$mode" == "cleanup" ]; then 848 cleanup_linux 849 clear_hugepages 850 elif [ "$mode" == "reset" ]; then 851 reset_linux 852 elif [ "$mode" == "status" ]; then 853 status_linux 854 elif [ "$mode" == "help" ]; then 855 usage $0 856 else 857 usage $0 "Invalid argument '$mode'" 858 fi 859else 860 if [ "$mode" == "config" ]; then 861 configure_freebsd 862 elif [ "$mode" == "reset" ]; then 863 reset_freebsd 864 elif [ "$mode" == "cleanup" ]; then 865 echo "setup.sh cleanup function not yet supported on $os" 866 elif [ "$mode" == "status" ]; then 867 status_freebsd 868 elif [ "$mode" == "help" ]; then 869 usage $0 870 else 871 usage $0 "Invalid argument '$mode'" 872 fi 873fi 874 875if [[ -e /proc/$sync_pid/status ]]; then 876 wait "$sync_pid" 877fi 878