#!/usr/bin/env bash

set -e

os=$(uname -s)

if [[ $os != Linux && $os != FreeBSD ]]; then
	echo "Unsupported platform ($os), aborting"
	exit 1
fi

rootdir=$(readlink -f "$(dirname "$0")")/..
source "$rootdir/scripts/common.sh"

function usage() {
	if [[ $os == Linux ]]; then
		options="[config|reset|status|cleanup|help]"
	else
		options="[config|reset|help]"
	fi

	[[ -n $2 ]] && (
		echo "$2"
		echo ""
	)
	echo "Helper script for allocating hugepages and binding NVMe, I/OAT, VMD and Virtio devices"
	echo "to a generic VFIO kernel driver. If VFIO is not available on the system, this script"
	echo "will fall back to UIO. NVMe and Virtio devices with active mountpoints will be ignored."
	echo "All hugepage operations use the default hugepage size on the system (hugepagesz)."
	echo "Usage: $(basename "$1") $options"
	echo
	echo "$options - as follows:"
	echo "config            Default mode. Allocate hugepages and bind PCI devices."
	if [[ $os == Linux ]]; then
		echo "cleanup           Remove any orphaned files that may be left in the system after an SPDK application exits"
	fi
	echo "reset             Rebind PCI devices back to their original drivers."
	echo "                  Also cleanup any leftover spdk files/resources."
	echo "                  Hugepage memory size will remain unchanged."
	if [[ $os == Linux ]]; then
		echo "status            Print status of all SPDK-compatible devices on the system."
	fi
	echo "help              Print this help message."
	echo
	echo "The following environment variables can be specified."
	echo "HUGEMEM           Size of hugepage memory to allocate (in MB). 2048 by default."
	echo "                  For NUMA systems, the hugepages will be distributed on node0 by"
	echo "                  default."
	echo "HUGE_EVEN_ALLOC   If set to 'yes', hugepages will be evenly distributed across all"
	echo "                  system's NUMA nodes (effectively ignoring anything set in HUGENODE)."
	echo "                  Uses kernel's default for hugepages size."
	echo "NRHUGE            Number of hugepages to allocate. This variable overwrites HUGEMEM."
	echo "HUGENODE          Specific NUMA node to allocate hugepages on. Multiple nodes can be"
	echo "                  separated with commas. By default, NRHUGE will be applied on each node."
	echo "                  Hugepages can be defined per node with e.g.:"
	echo "                  HUGENODE='nodes_hp[0]=2048,nodes_hp[1]=512,2' - this will allocate"
	echo "                  2048 pages for node0, 512 for node1 and default NRHUGE for node2."
	echo "HUGEPGSZ          Size of the hugepages to use in kB. If not set, kernel's default"
	echo "                  setting is used."
	echo "CLEAR_HUGE        If set to 'yes', an attempt to remove hugepages from all nodes will"
	echo "                  be made prior to allocation."
	echo "PCI_ALLOWED"
	echo "PCI_BLOCKED       Whitespace separated list of PCI devices (NVMe, I/OAT, VMD, Virtio)."
	echo "                  Each device must be specified as a full PCI address."
	echo "                  E.g. PCI_ALLOWED=\"0000:01:00.0 0000:02:00.0\""
	echo "                  To block all PCI devices: PCI_ALLOWED=\"none\""
	echo "                  To allow all PCI devices except 0000:01:00.0: PCI_BLOCKED=\"0000:01:00.0\""
	echo "                  To allow only PCI device 0000:01:00.0: PCI_ALLOWED=\"0000:01:00.0\""
	echo "                  If PCI_ALLOWED and PCI_BLOCKED are empty or unset, all PCI devices"
	echo "                  will be bound."
	echo "                  Each device in PCI_BLOCKED will be ignored (driver won't be changed)."
	echo "                  PCI_BLOCKED has precedence over PCI_ALLOWED."
	echo "TARGET_USER       User that will own hugepage mountpoint directory and vfio groups."
	echo "                  By default the current user will be used."
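	# (Illustrative example added to the help text; "dpdkuser" is a hypothetical
	# account name, not something this script creates.)
	echo "                  E.g. TARGET_USER=dpdkuser makes that user the owner of the"
	echo "                  hugetlbfs mountpoint and of any /dev/vfio group nodes."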
75 echo "DRIVER_OVERRIDE Disable automatic vfio-pci/uio_pci_generic selection and forcefully" 76 echo " bind devices to the given driver." 77 echo " E.g. DRIVER_OVERRIDE=uio_pci_generic or DRIVER_OVERRIDE=/home/public/dpdk/build/kmod/igb_uio.ko" 78 echo "PCI_BLOCK_SYNC_ON_RESET" 79 echo " If set in the environment, the attempt to wait for block devices associated" 80 echo " with given PCI device will be made upon reset" 81 exit 0 82} 83 84# In monolithic kernels the lsmod won't work. So 85# back that with a /sys/modules. We also check 86# /sys/bus/pci/drivers/ as neither lsmod nor /sys/modules might 87# contain needed info (like in Fedora-like OS). 88function check_for_driver() { 89 if [[ -z $1 ]]; then 90 return 0 91 fi 92 93 if lsmod | grep -q ${1//-/_}; then 94 return 1 95 fi 96 97 if [[ -d /sys/module/${1} || -d \ 98 /sys/module/${1//-/_} || -d \ 99 /sys/bus/pci/drivers/${1} || -d \ 100 /sys/bus/pci/drivers/${1//-/_} ]]; then 101 return 2 102 fi 103 return 0 104} 105 106function check_for_driver_freebsd() { 107 # Check if dpdk drivers (nic_uio, contigmem) are in the kernel's module path. 108 local search_paths path driver 109 IFS=";" read -ra search_paths < <(kldconfig -rU) 110 111 for driver in contigmem.ko nic_uio.ko; do 112 for path in "${search_paths[@]}"; do 113 [[ -f $path/$driver ]] && continue 2 114 done 115 return 1 116 done 117 return 0 118} 119 120function pci_dev_echo() { 121 local bdf="$1" 122 shift 123 echo "$bdf (${pci_ids_vendor["$bdf"]#0x} ${pci_ids_device["$bdf"]#0x}): $*" 124} 125 126function linux_bind_driver() { 127 bdf="$1" 128 driver_name="$2" 129 old_driver_name=${drivers_d["$bdf"]:-no driver} 130 ven_dev_id="${pci_ids_vendor["$bdf"]#0x} ${pci_ids_device["$bdf"]#0x}" 131 132 if [[ $driver_name == "$old_driver_name" ]]; then 133 pci_dev_echo "$bdf" "Already using the $old_driver_name driver" 134 return 0 135 fi 136 137 if [[ $old_driver_name != "no driver" ]]; then 138 echo "$ven_dev_id" > "/sys/bus/pci/devices/$bdf/driver/remove_id" 2> /dev/null || true 139 echo "$bdf" > "/sys/bus/pci/devices/$bdf/driver/unbind" 140 fi 141 142 pci_dev_echo "$bdf" "$old_driver_name -> $driver_name" 143 144 if [[ $driver_name == "none" ]]; then 145 return 0 146 fi 147 148 echo "$ven_dev_id" > "/sys/bus/pci/drivers/$driver_name/new_id" 2> /dev/null || true 149 echo "$bdf" > "/sys/bus/pci/drivers/$driver_name/bind" 2> /dev/null || true 150 151 if [[ $driver_name == uio_pci_generic ]] && ! check_for_driver igb_uio; then 152 # Check if the uio_pci_generic driver is broken as it might be in 153 # some 4.18.x kernels (see centos8 for instance) - if our device 154 # didn't get a proper uio entry, fallback to igb_uio 155 if [[ ! 
			pci_dev_echo "$bdf" "uio_pci_generic potentially broken, moving to igb_uio"
			drivers_d["$bdf"]="no driver"
			# This call will override $driver_name for remaining devices as well
			linux_bind_driver "$bdf" igb_uio
		fi
	fi

	iommu_group=$(basename "$(readlink -f "/sys/bus/pci/devices/$bdf/iommu_group")")
	if [ -e "/dev/vfio/$iommu_group" ]; then
		if [ -n "$TARGET_USER" ]; then
			chown "$TARGET_USER" "/dev/vfio/$iommu_group"
		fi
	fi
}

function linux_unbind_driver() {
	local bdf="$1"
	local ven_dev_id
	ven_dev_id="${pci_ids_vendor["$bdf"]#0x} ${pci_ids_device["$bdf"]#0x}"
	local old_driver_name=${drivers_d["$bdf"]:-no driver}

	if [[ $old_driver_name == "no driver" ]]; then
		pci_dev_echo "$bdf" "Not bound to any driver"
		return 0
	fi

	if [[ -e /sys/bus/pci/drivers/$old_driver_name ]]; then
		echo "$ven_dev_id" > "/sys/bus/pci/drivers/$old_driver_name/remove_id" 2> /dev/null || true
		echo "$bdf" > "/sys/bus/pci/drivers/$old_driver_name/unbind"
	fi

	pci_dev_echo "$bdf" "$old_driver_name -> no driver"
}

function linux_hugetlbfs_mounts() {
	mount | grep ' type hugetlbfs ' | awk '{ print $3 }'
}

function get_block_dev_from_bdf() {
	local bdf=$1
	local block

	for block in /sys/block/*; do
		if [[ $(readlink -f "$block/device") == *"/$bdf/"* ]]; then
			echo "${block##*/}"
		fi
	done
}

function get_used_bdf_block_devs() {
	local bdf=$1
	local blocks block blockp dev mount holder
	local used

	hash lsblk || return 1
	blocks=($(get_block_dev_from_bdf "$bdf"))

	for block in "${blocks[@]}"; do
		# Check if the device is held by some other device, regardless of
		# whether it's mounted or not.
		for holder in "/sys/class/block/$block"*/holders/*; do
			[[ -e $holder ]] || continue
			blockp=${holder%/holders*} blockp=${blockp##*/}
			if [[ -e $holder/slaves/$blockp ]]; then
				used+=("holder@$blockp:${holder##*/}")
			fi
		done
		while read -r dev mount; do
			if [[ -e $mount ]]; then
				used+=("mount@$block:$dev")
			fi
		done < <(lsblk -l -n -o NAME,MOUNTPOINT "/dev/$block")
		if ((${#used[@]} == 0)); then
			# Make sure we check if there's any valid data present on the target
			# device, regardless of whether it's being actively used or not. This
			# is mainly done to make sure we don't miss more complex setups like
			# ZFS pools, etc.
			if block_in_use "$block" > /dev/null; then
				used+=("data@$block")
			fi
		fi
	done

	if ((${#used[@]} > 0)); then
		printf '%s\n' "${used[@]}"
	fi
}

function collect_devices() {
	# NVMe, IOAT, IDXD, VIRTIO, VMD

	local ids dev_type dev_id bdf bdfs in_use driver

	ids+="PCI_DEVICE_ID_INTEL_IOAT"
	ids+="|PCI_DEVICE_ID_INTEL_IDXD"
	ids+="|PCI_DEVICE_ID_VIRTIO"
	ids+="|PCI_DEVICE_ID_INTEL_VMD"
	ids+="|SPDK_PCI_CLASS_NVME"

	local -gA nvme_d ioat_d idxd_d virtio_d vmd_d all_devices_d drivers_d

	while read -r _ dev_type dev_id; do
		bdfs=(${pci_bus_cache["0x8086:$dev_id"]})
		[[ $dev_type == *NVME* ]] && bdfs=(${pci_bus_cache["$dev_id"]})
		[[ $dev_type == *VIRT* ]] && bdfs=(${pci_bus_cache["0x1af4:$dev_id"]})
		[[ $dev_type =~ (NVME|IOAT|IDXD|VIRTIO|VMD) ]] && dev_type=${BASH_REMATCH[1],,}
		for bdf in "${bdfs[@]}"; do
			in_use=0
			if [[ $1 != status ]]; then
				if ! pci_can_use "$bdf"; then
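					# (Note: pci_can_use() is sourced from scripts/common.sh at the
					# top of this script; it applies the PCI_ALLOWED/PCI_BLOCKED
					# filters documented in usage().)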
pci_can_use "$bdf"; then 265 pci_dev_echo "$bdf" "Skipping denied controller at $bdf" 266 in_use=1 267 fi 268 if [[ $dev_type == nvme || $dev_type == virtio ]]; then 269 if ! verify_bdf_block_devs "$bdf"; then 270 in_use=1 271 fi 272 fi 273 if [[ $dev_type == vmd ]]; then 274 if [[ $PCI_ALLOWED != *"$bdf"* ]]; then 275 pci_dev_echo "$bdf" "Skipping not allowed VMD controller at $bdf" 276 in_use=1 277 elif [[ " ${drivers_d[*]} " =~ "nvme" ]]; then 278 if [[ "${DRIVER_OVERRIDE}" != "none" ]]; then 279 if [ "$mode" == "config" ]; then 280 cat <<- MESSAGE 281 Binding new driver to VMD device. If there are NVMe SSDs behind the VMD endpoint 282 which are attached to the kernel NVMe driver,the binding process may go faster 283 if you first run this script with DRIVER_OVERRIDE="none" to unbind only the 284 NVMe SSDs, and then run again to unbind the VMD devices." 285 MESSAGE 286 fi 287 fi 288 fi 289 fi 290 fi 291 eval "${dev_type}_d[$bdf]=$in_use" 292 all_devices_d["$bdf"]=$in_use 293 if [[ -e /sys/bus/pci/devices/$bdf/driver ]]; then 294 driver=$(readlink -f "/sys/bus/pci/devices/$bdf/driver") 295 drivers_d["$bdf"]=${driver##*/} 296 fi 297 done 298 done < <(grep -E "$ids" "$rootdir/include/spdk/pci_ids.h") 299} 300 301function collect_driver() { 302 local bdf=$1 303 local drivers driver 304 305 if [[ -e /sys/bus/pci/devices/$bdf/modalias ]] \ 306 && drivers=($(modprobe -R "$(< "/sys/bus/pci/devices/$bdf/modalias")")); then 307 # Pick first entry in case multiple aliases are bound to a driver. 308 driver=$(readlink -f "/sys/module/${drivers[0]}/drivers/pci:"*) 309 driver=${driver##*/} 310 else 311 [[ -n ${nvme_d["$bdf"]} ]] && driver=nvme 312 [[ -n ${ioat_d["$bdf"]} ]] && driver=ioatdma 313 [[ -n ${idxd_d["$bdf"]} ]] && driver=idxd 314 [[ -n ${virtio_d["$bdf"]} ]] && driver=virtio-pci 315 [[ -n ${vmd_d["$bdf"]} ]] && driver=vmd 316 fi 2> /dev/null 317 echo "$driver" 318} 319 320function verify_bdf_block_devs() { 321 local bdf=$1 322 local blknames 323 blknames=($(get_used_bdf_block_devs "$bdf")) || return 1 324 325 if ((${#blknames[@]} > 0)); then 326 local IFS="," 327 pci_dev_echo "$bdf" "Active devices: ${blknames[*]}, so not binding PCI dev" 328 return 1 329 fi 330} 331 332function configure_linux_pci() { 333 local driver_path="" 334 driver_name="" 335 igb_uio_fallback="" 336 337 if [[ -r "$rootdir/dpdk/build-tmp/kernel/linux/igb_uio/igb_uio.ko" ]]; then 338 # igb_uio is a common driver to override with and it depends on uio. 339 modprobe uio || true 340 if ! check_for_driver igb_uio || insmod "$rootdir/dpdk/build-tmp/kernel/linux/igb_uio/igb_uio.ko"; then 341 igb_uio_fallback="$rootdir/dpdk/build-tmp/kernel/linux/igb_uio/igb_uio.ko" 342 fi 343 fi 344 345 if [[ "${DRIVER_OVERRIDE}" == "none" ]]; then 346 driver_name=none 347 elif [[ -n "${DRIVER_OVERRIDE}" ]]; then 348 driver_path="$DRIVER_OVERRIDE" 349 driver_name="${DRIVER_OVERRIDE##*/}" 350 # modprobe and the sysfs don't use the .ko suffix. 
		driver_name=${driver_name%.ko}
		# path = name -> there is no path
		if [[ "$driver_path" = "$driver_name" ]]; then
			driver_path=""
		fi
	elif [[ -n "$(ls /sys/kernel/iommu_groups)" || (-e \
		/sys/module/vfio/parameters/enable_unsafe_noiommu_mode && \
		"$(cat /sys/module/vfio/parameters/enable_unsafe_noiommu_mode)" == "Y") ]]; then
		driver_name=vfio-pci
		# Just in case, attempt to load the VFIO_IOMMU_TYPE1 module into the kernel.
		# This should be done automatically by modprobe, since this particular module
		# should be a part of vfio-pci dependencies; however, on some distros it
		# seems that this is not the case. See #1689.
		if modinfo vfio_iommu_type1 > /dev/null; then
			modprobe vfio_iommu_type1
		fi
	elif ! check_for_driver uio_pci_generic || modinfo uio_pci_generic > /dev/null 2>&1; then
		driver_name=uio_pci_generic
	elif [[ -e $igb_uio_fallback ]]; then
		driver_path="$igb_uio_fallback"
		driver_name="igb_uio"
		echo "WARNING: uio_pci_generic not detected - using $driver_name"
	else
		echo "No valid drivers found [vfio-pci, uio_pci_generic, igb_uio]. Please enable one of the kernel modules."
		return 1
	fi

	# modprobe looks the module up in the kernel's module directories. If the user
	# passed in an explicit path, use insmod instead.
	if [[ $driver_name != "none" ]]; then
		if [[ -n "$driver_path" ]]; then
			insmod "$driver_path" || true
		else
			modprobe "$driver_name"
		fi
	fi

	for bdf in "${!all_devices_d[@]}"; do
		if ((all_devices_d["$bdf"] == 0)); then
			if [[ -n ${nvme_d["$bdf"]} ]]; then
				# Some nvme controllers may take a significant amount of time while
				# being unbound from the driver. Put that task into the background to
				# speed up the whole process. Currently this is done only for devices
				# bound to the nvme driver as others, i.e. ioatdma's, trigger a kernel
				# BUG when being unbound in parallel. See
				# https://bugzilla.kernel.org/show_bug.cgi?id=209041.
				linux_bind_driver "$bdf" "$driver_name" &
			else
				linux_bind_driver "$bdf" "$driver_name"
			fi
		fi
	done
	wait

	echo "1" > "/sys/bus/pci/rescan"
}

function cleanup_linux() {
	shopt -s extglob nullglob
	dirs_to_clean=""
	dirs_to_clean="$(echo {/var/run,/tmp}/dpdk/spdk{,_pid}+([0-9])) "
	if [[ -d $XDG_RUNTIME_DIR && $XDG_RUNTIME_DIR != *" "* ]]; then
		dirs_to_clean+="$(readlink -e assert_not_empty $XDG_RUNTIME_DIR/dpdk/spdk{,_pid}+([0-9]) || true) "
	fi

	files_to_clean="" file_locks=()
	for dir in $dirs_to_clean; do
		files_to_clean+="$(echo $dir/*) "
	done
	file_locks+=(/var/tmp/spdk_pci_lock*)
	shopt -u extglob nullglob

	files_to_clean+="$(ls -1 /dev/shm/* \
		| grep -E '(spdk_tgt|iscsi|vhost|nvmf|rocksdb|bdevio|bdevperf|vhost_fuzz|nvme_fuzz)_trace|spdk_iscsi_conns' || true) "
	files_to_clean+=" ${file_locks[*]}"
	files_to_clean="$(readlink -e assert_not_empty $files_to_clean || true)"
	if [[ -z "$files_to_clean" ]]; then
		echo "Clean"
		return 0
	fi

	shopt -s extglob
	for fd_dir in $(echo /proc/+([0-9])); do
		opened_files+="$(readlink -e assert_not_empty $fd_dir/fd/* || true)"
	done
	shopt -u extglob

	if [[ -z "$opened_files" ]]; then
		echo "Can't get list of opened files!"
		exit 1
	fi

	echo 'Cleaning'
	for f in $files_to_clean; do
		if ! echo "$opened_files" | grep -E -q "^$f\$"; then
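			# (The pattern is anchored on both sides, so a file is removed only on
			# an exact match against the list of files opened by any process - a
			# path that is merely a prefix of an opened file doesn't count.)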
echo "$opened_files" | grep -E -q "^$f\$"; then 444 echo "Removing: $f" 445 rm $f 446 else 447 echo "Still open: $f" 448 fi 449 done 450 451 for dir in $dirs_to_clean; do 452 if ! echo "$opened_files" | grep -E -q "^$dir\$"; then 453 echo "Removing: $dir" 454 rmdir $dir 455 else 456 echo "Still open: $dir" 457 fi 458 done 459 echo "Clean" 460 461 unset dirs_to_clean files_to_clean opened_files 462} 463 464check_hugepages_alloc() { 465 local hp_int=$1 466 local allocated_hugepages 467 468 echo $((NRHUGE < 0 ? 0 : NRHUGE)) > "$hp_int" 469 470 allocated_hugepages=$(< "$hp_int") 471 if ((allocated_hugepages < NRHUGE)); then 472 cat <<- ERROR 473 474 ## ERROR: requested $NRHUGE hugepages but $allocated_hugepages could be allocated ${2:+on node$2}. 475 ## Memory might be heavily fragmented. Please try flushing the system cache, or reboot the machine. 476 ERROR 477 return 1 478 fi 479} 480 481clear_hugepages() { echo 0 > /proc/sys/vm/nr_hugepages; } 482 483configure_linux_hugepages() { 484 local node system_nodes 485 local nodes_to_use nodes_hp 486 487 if [[ $CLEAR_HUGE == yes ]]; then 488 clear_hugepages 489 fi 490 491 if [[ $HUGE_EVEN_ALLOC == yes ]]; then 492 clear_hugepages 493 check_hugepages_alloc /proc/sys/vm/nr_hugepages 494 return 0 495 fi 496 497 for node in /sys/devices/system/node/node*; do 498 [[ -e $node ]] || continue 499 nodes[${node##*node}]=$node/hugepages/hugepages-${HUGEPGSZ}kB/nr_hugepages 500 done 501 502 if ((${#nodes[@]} == 0)); then 503 # No NUMA support? Fallback to common interface 504 check_hugepages_alloc /proc/sys/vm/nr_hugepages 505 return 0 506 fi 507 508 IFS="," read -ra nodes_to_use <<< "$HUGENODE" 509 if ((${#nodes_to_use[@]} == 0)); then 510 nodes_to_use[0]=0 511 fi 512 513 # Align indexes with node ids 514 for node in "${!nodes_to_use[@]}"; do 515 if [[ ${nodes_to_use[node]} =~ ^nodes_hp\[[0-9]+\]= ]]; then 516 eval "${nodes_to_use[node]}" 517 elif [[ ${nodes_to_use[node]} =~ ^[0-9]+$ ]]; then 518 nodes_hp[nodes_to_use[node]]=$NRHUGE 519 fi 520 done 521 522 for node in "${!nodes_hp[@]}"; do 523 if [[ -z ${nodes[node]} ]]; then 524 echo "Node $node doesn't exist, ignoring" >&2 525 continue 526 fi 527 NRHUGE=${nodes_hp[node]:-$NRHUGE} check_hugepages_alloc "${nodes[node]}" "$node" 528 done 529} 530 531function configure_linux() { 532 configure_linux_pci 533 hugetlbfs_mounts=$(linux_hugetlbfs_mounts) 534 535 if [ -z "$hugetlbfs_mounts" ]; then 536 hugetlbfs_mounts=/mnt/huge 537 echo "Mounting hugetlbfs at $hugetlbfs_mounts" 538 mkdir -p "$hugetlbfs_mounts" 539 mount -t hugetlbfs nodev "$hugetlbfs_mounts" 540 fi 541 542 configure_linux_hugepages 543 544 if [ "$driver_name" = "vfio-pci" ]; then 545 if [ -n "$TARGET_USER" ]; then 546 for mount in $hugetlbfs_mounts; do 547 chown "$TARGET_USER" "$mount" 548 chmod g+w "$mount" 549 done 550 551 MEMLOCK_AMNT=$(su "$TARGET_USER" -c "ulimit -l") 552 if [[ $MEMLOCK_AMNT != "unlimited" ]]; then 553 MEMLOCK_MB=$((MEMLOCK_AMNT / 1024)) 554 cat <<- MEMLOCK 555 "$TARGET_USER" user memlock limit: $MEMLOCK_MB MB 556 557 This is the maximum amount of memory you will be 558 able to use with DPDK and VFIO if run as user "$TARGET_USER". 559 To change this, please adjust limits.conf memlock limit for user "$TARGET_USER". 560 MEMLOCK 561 if ((MEMLOCK_AMNT < 65536)); then 562 echo "" 563 echo "## WARNING: memlock limit is less than 64MB" 564 echo -n "## DPDK with VFIO may not be able to initialize " 565 echo "if run as user \"$TARGET_USER\"." 566 fi 567 fi 568 fi 569 fi 570 571 if [ ! 

	if [ ! -e /dev/cpu/0/msr ]; then
		# Some distros build msr as a module. Make sure it's loaded to ensure
		# DPDK can easily figure out the TSC rate rather than relying on 100ms
		# sleeps.
		modprobe msr || true
	fi
}

function reset_linux_pci() {
	# virtio
	# TODO: check if virtio-pci is loaded first, and just unbind if it is not loaded.
	# Requires some more investigation - for example, some kernels do not seem to have
	# virtio-pci but just virtio_scsi instead. Also need to make sure we get the
	# underscore vs. dash right in the virtio_scsi name.
	modprobe virtio-pci || true
	for bdf in "${!all_devices_d[@]}"; do
		((all_devices_d["$bdf"] == 0)) || continue

		driver=$(collect_driver "$bdf")
		if [[ -n $driver ]] && ! check_for_driver "$driver"; then
			linux_bind_driver "$bdf" "$driver"
		else
			linux_unbind_driver "$bdf"
		fi
	done

	echo "1" > "/sys/bus/pci/rescan"
}

function reset_linux() {
	reset_linux_pci
	for mount in $(linux_hugetlbfs_mounts); do
		for hp in "$mount"/spdk*map_*; do
			flock -n "$hp" true && rm -f "$hp"
		done
	done
	rm -f /run/.spdk*
}

function status_linux() {
	echo "Hugepages" >&2
	printf "%-6s %10s %8s / %6s\n" "node" "hugesize" "free" "total" >&2

	numa_nodes=0
	shopt -s nullglob
	for path in /sys/devices/system/node/node*/hugepages/hugepages-*/; do
		numa_nodes=$((numa_nodes + 1))
		free_pages=$(cat $path/free_hugepages)
		all_pages=$(cat $path/nr_hugepages)

		[[ $path =~ (node[0-9]+)/hugepages/hugepages-([0-9]+kB) ]]

		node=${BASH_REMATCH[1]}
		huge_size=${BASH_REMATCH[2]}

		printf "%-6s %10s %8s / %6s\n" $node $huge_size $free_pages $all_pages
	done
	shopt -u nullglob

	# Fall back to system-wide hugepages
	if [ "$numa_nodes" = "0" ]; then
		free_pages=$(grep HugePages_Free /proc/meminfo | awk '{ print $2 }')
		all_pages=$(grep HugePages_Total /proc/meminfo | awk '{ print $2 }')
		node="-"
		huge_size="$HUGEPGSZ"

		printf "%-6s %10s %8s / %6s\n" $node $huge_size $free_pages $all_pages
	fi

	printf '\n%-8s %-15s %-6s %-6s %-7s %-16s %-10s %s\n' \
		"Type" "BDF" "Vendor" "Device" "NUMA" "Driver" "Device" "Block devices" >&2

	sorted_bdfs=($(printf '%s\n' "${!all_devices_d[@]}" | sort))

	for bdf in "${sorted_bdfs[@]}"; do
		driver=${drivers_d["$bdf"]}
		if [ "$numa_nodes" = "0" ]; then
			node="-"
		else
			node=$(cat /sys/bus/pci/devices/$bdf/numa_node)
			if ((node == -1)); then
				node=unknown
			fi
		fi
		if [ "$driver" = "nvme" ] && [ -d /sys/bus/pci/devices/$bdf/nvme ]; then
			name=$(ls /sys/bus/pci/devices/$bdf/nvme)
		else
			name="-"
		fi

		if [[ -n ${nvme_d["$bdf"]} || -n ${virtio_d["$bdf"]} ]]; then
			blknames=($(get_block_dev_from_bdf "$bdf"))
		else
			blknames=("-")
		fi

		desc=""
		desc=${desc:-${nvme_d["$bdf"]:+NVMe}}
		desc=${desc:-${ioat_d["$bdf"]:+I/OAT}}
		desc=${desc:-${idxd_d["$bdf"]:+IDXD}}
		desc=${desc:-${virtio_d["$bdf"]:+virtio}}
		desc=${desc:-${vmd_d["$bdf"]:+VMD}}

		printf '%-8s %-15s %-6s %-6s %-7s %-16s %-10s %s\n' \
			"$desc" "$bdf" "${pci_ids_vendor["$bdf"]#0x}" "${pci_ids_device["$bdf"]#0x}" \
			"$node" "${driver:--}" "${name:-}" "${blknames[*]:--}"
	done
}

function status_freebsd() {
	local pci

	status_print() (
		local dev driver

		echo -e "BDF\t\tVendor\tDevice\tDriver"

		for pci; do
			driver=$(pciconf -l "pci$pci")
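			# (pciconf -l output begins with the attached driver instance, e.g.
			# "nvme0@pci0:1:0:0: class=0x010802 ..."; the expansion below keeps
			# only the part before the '@'.)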
			driver=${driver%@*}
			printf '%s\t%s\t%s\t%s\n' \
				"$pci" \
				"${pci_ids_vendor["$pci"]}" \
				"${pci_ids_device["$pci"]}" \
				"$driver"
		done | sort -k1,1
	)

	local contigmem=present
	local contigmem_buffer_size
	local contigmem_num_buffers

	if ! kldstat -q -m contigmem; then
		contigmem="not present"
	fi
	if ! contigmem_buffer_size=$(kenv hw.contigmem.buffer_size 2> /dev/null); then
		contigmem_buffer_size="not set"
	fi
	if ! contigmem_num_buffers=$(kenv hw.contigmem.num_buffers 2> /dev/null); then
		contigmem_num_buffers="not set"
	fi

	cat <<- BSD_INFO
		Contigmem ($contigmem)
		Buffer Size: $contigmem_buffer_size
		Num Buffers: $contigmem_num_buffers

		NVMe devices
		$(status_print "${!nvme_d[@]}")

		I/OAT DMA
		$(status_print "${!ioat_d[@]}")

		IDXD DMA
		$(status_print "${!idxd_d[@]}")

		VMD
		$(status_print "${!vmd_d[@]}")
	BSD_INFO
}

function configure_freebsd_pci() {
	local BDFS

	BDFS+=("${!nvme_d[@]}")
	BDFS+=("${!ioat_d[@]}")
	BDFS+=("${!idxd_d[@]}")
	BDFS+=("${!vmd_d[@]}")

	# Drop the domain part from all the addresses
	BDFS=("${BDFS[@]#*:}")

	local IFS=","
	kldunload nic_uio.ko || true
	kenv hw.nic_uio.bdfs="${BDFS[*]}"
	kldload nic_uio.ko
}

function configure_freebsd() {
	if ! check_for_driver_freebsd; then
		echo "DPDK drivers (contigmem and/or nic_uio) are missing, aborting" >&2
		return 1
	fi
	configure_freebsd_pci
	# If contigmem is already loaded, but the HUGEMEM specified doesn't match the
	# previous value, unload contigmem so that we can reload with the new value.
	if kldstat -q -m contigmem; then
		# contigmem may be loaded, but the kernel environment doesn't necessarily
		# have to be set at this point. If it isn't, kenv will fail to pick up
		# the hw. options. Handle it.
		if ! contigmem_num_buffers=$(kenv hw.contigmem.num_buffers); then
			contigmem_num_buffers=-1
		fi 2> /dev/null
		if ((contigmem_num_buffers != HUGEMEM / 256)); then
			kldunload contigmem.ko
		fi
	fi
	if ! kldstat -q -m contigmem; then
		kenv hw.contigmem.num_buffers=$((HUGEMEM / 256))
		kenv hw.contigmem.buffer_size=$((256 * 1024 * 1024))
		kldload contigmem.ko
	fi
}

function reset_freebsd() {
	kldunload contigmem.ko || true
	kldunload nic_uio.ko || true
}

CMD=reset cache_pci_bus

mode=$1

if [ -z "$mode" ]; then
	mode="config"
fi

: ${HUGEMEM:=2048}
: ${PCI_ALLOWED:=""}
: ${PCI_BLOCKED:=""}

if [ -n "$NVME_ALLOWED" ]; then
	PCI_ALLOWED="$PCI_ALLOWED $NVME_ALLOWED"
fi

if [ -n "$SKIP_PCI" ]; then
	PCI_ALLOWED="none"
fi

if [ -z "$TARGET_USER" ]; then
	TARGET_USER="$SUDO_USER"
	if [ -z "$TARGET_USER" ]; then
		TARGET_USER=$(logname 2> /dev/null) || true
	fi
fi

collect_devices "$mode"

if [[ $mode == reset && $PCI_BLOCK_SYNC_ON_RESET == yes ]]; then
	# Note that this will wait only for the first block device attached to
	# a given storage controller. For nvme this may miss some of the devs
	# in case multiple namespaces are in place.
	# FIXME: Wait for nvme controller(s) to be in live state and determine
	# number of configured namespaces, build a list of potential block devs
	# and pass them to sync_dev_uevents. Is it worth the effort?
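	# Example invocation that exercises this path (illustrative):
	#   sudo PCI_BLOCK_SYNC_ON_RESET=yes ./setup.sh reset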
	bdfs_to_wait_for=()
	for bdf in "${!all_devices_d[@]}"; do
		((all_devices_d["$bdf"] == 0)) || continue
		if [[ -n ${nvme_d["$bdf"]} || -n ${virtio_d["$bdf"]} ]]; then
			[[ $(collect_driver "$bdf") != "${drivers_d["$bdf"]}" ]] || continue
			bdfs_to_wait_for+=("$bdf")
		fi
	done
	if ((${#bdfs_to_wait_for[@]} > 0)); then
		echo "Waiting for block devices as requested"
		export UEVENT_TIMEOUT=5 DEVPATH_LOOKUP=yes DEVPATH_SUBSYSTEM=pci
		"$rootdir/scripts/sync_dev_uevents.sh" \
			block/disk \
			"${bdfs_to_wait_for[@]}" &
		sync_pid=$!
	fi
fi

if [[ $os == Linux ]]; then
	if [[ -n $HUGEPGSZ && ! -e /sys/kernel/mm/hugepages/hugepages-${HUGEPGSZ}kB ]]; then
		echo "${HUGEPGSZ}kB is not supported by the running kernel, ignoring" >&2
		unset -v HUGEPGSZ
	fi

	HUGEPGSZ=${HUGEPGSZ:-$(grep Hugepagesize /proc/meminfo | cut -d : -f 2 | tr -dc '0-9')}
	HUGEPGSZ_MB=$((HUGEPGSZ / 1024))
	# Round up so the requested HUGEMEM is fully covered, e.g. with the common
	# 2048 kB hugepages and HUGEMEM=2048, NRHUGE = (2048 + 2 - 1) / 2 = 1024 pages.
	: ${NRHUGE=$(((HUGEMEM + HUGEPGSZ_MB - 1) / HUGEPGSZ_MB))}

	if [ "$mode" == "config" ]; then
		configure_linux
	elif [ "$mode" == "cleanup" ]; then
		cleanup_linux
		clear_hugepages
	elif [ "$mode" == "reset" ]; then
		reset_linux
	elif [ "$mode" == "status" ]; then
		status_linux
	elif [ "$mode" == "help" ]; then
		usage "$0"
	else
		usage "$0" "Invalid argument '$mode'"
	fi
else
	if [ "$mode" == "config" ]; then
		configure_freebsd
	elif [ "$mode" == "reset" ]; then
		reset_freebsd
	elif [ "$mode" == "cleanup" ]; then
		echo "setup.sh cleanup function not yet supported on $os"
	elif [ "$mode" == "status" ]; then
		status_freebsd
	elif [ "$mode" == "help" ]; then
		usage "$0"
	else
		usage "$0" "Invalid argument '$mode'"
	fi
fi

if [[ -e /proc/$sync_pid/status ]]; then
	wait "$sync_pid"
fi
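
# Invocation examples (all values illustrative):
#   sudo ./setup.sh                              # config mode: hugepages + device binding
#   sudo HUGEMEM=4096 ./setup.sh config          # 4 GB of hugepages instead of the 2 GB default
#   sudo PCI_ALLOWED="0000:01:00.0" ./setup.sh   # bind only the listed controller
#   sudo ./setup.sh reset                        # rebind devices to their original drivers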