1#!/usr/bin/env bash 2 3set -e 4 5os=$(uname -s) 6 7if [[ $os != Linux && $os != FreeBSD ]]; then 8 echo "Not supported platform ($os), aborting" 9 exit 1 10fi 11 12rootdir=$(readlink -f $(dirname $0))/.. 13source "$rootdir/scripts/common.sh" 14 15function usage() { 16 if [[ $os == Linux ]]; then 17 options="[config|reset|status|cleanup|help]" 18 else 19 options="[config|reset|help]" 20 fi 21 22 [[ -n $2 ]] && ( 23 echo "$2" 24 echo "" 25 ) 26 echo "Helper script for allocating hugepages and binding NVMe, I/OAT, VMD and Virtio devices" 27 echo "to a generic VFIO kernel driver. If VFIO is not available on the system, this script" 28 echo "will fall back to UIO. NVMe and Virtio devices with active mountpoints will be ignored." 29 echo "All hugepage operations use default hugepage size on the system (hugepagesz)." 30 echo "Usage: $(basename $1) $options" 31 echo 32 echo "$options - as following:" 33 echo "config Default mode. Allocate hugepages and bind PCI devices." 34 if [[ $os == Linux ]]; then 35 echo "cleanup Remove any orphaned files that can be left in the system after SPDK application exit" 36 fi 37 echo "reset Rebind PCI devices back to their original drivers." 38 echo " Also cleanup any leftover spdk files/resources." 39 echo " Hugepage memory size will remain unchanged." 40 if [[ $os == Linux ]]; then 41 echo "status Print status of all SPDK-compatible devices on the system." 42 fi 43 echo "help Print this help message." 44 echo 45 echo "The following environment variables can be specified." 46 echo "HUGEMEM Size of hugepage memory to allocate (in MB). 2048 by default." 47 echo " For NUMA systems, the hugepages will be distributed on node0 by" 48 echo " default." 49 echo "HUGE_EVEN_ALLOC If set to 'yes', hugepages will be evenly distributed across all" 50 echo " system's NUMA nodes (effectively ignoring anything set in HUGENODE)." 51 echo " Uses kernel's default for hugepages size." 52 echo "NRHUGE Number of hugepages to allocate. This variable overwrites HUGEMEM." 53 echo "HUGENODE Specific NUMA node to allocate hugepages on. Multiple nodes can be" 54 echo " separated with comas. By default, NRHUGE will be applied on each node." 55 echo " Hugepages can be defined per node with e.g.:" 56 echo " HUGENODE='nodes_hp[0]=2048,nodes_hp[1]=512,2' - this will allocate" 57 echo " 2048 pages for node0, 512 for node1 and default NRHUGE for node2." 58 echo "HUGEPGSZ Size of the hugepages to use in kB. If not set, kernel's default" 59 echo " setting is used." 60 echo "CLEAR_HUGE If set to 'yes', the attempt to remove hugepages from all nodes will" 61 echo " be made prior to allocation". 62 echo "PCI_ALLOWED" 63 echo "PCI_BLOCKED Whitespace separated list of PCI devices (NVMe, I/OAT, VMD, Virtio)." 64 echo " Each device must be specified as a full PCI address." 65 echo " E.g. PCI_ALLOWED=\"0000:01:00.0 0000:02:00.0\"" 66 echo " To block all PCI devices: PCI_ALLOWED=\"none\"" 67 echo " To allow all PCI devices except 0000:01:00.0: PCI_BLOCKED=\"0000:01:00.0\"" 68 echo " To allow only PCI device 0000:01:00.0: PCI_ALLOWED=\"0000:01:00.0\"" 69 echo " If PCI_ALLOWED and PCI_BLOCKED are empty or unset, all PCI devices" 70 echo " will be bound." 71 echo " Each device in PCI_BLOCKED will be ignored (driver won't be changed)." 72 echo " PCI_BLOCKED has precedence over PCI_ALLOWED." 73 echo "TARGET_USER User that will own hugepage mountpoint directory and vfio groups." 74 echo " By default the current user will be used." 75 echo "DRIVER_OVERRIDE Disable automatic vfio-pci/uio_pci_generic selection and forcefully" 76 echo " bind devices to the given driver." 77 echo " E.g. DRIVER_OVERRIDE=uio_pci_generic or DRIVER_OVERRIDE=/home/public/dpdk/build/kmod/igb_uio.ko" 78 echo "PCI_BLOCK_SYNC_ON_RESET" 79 echo " If set in the environment, the attempt to wait for block devices associated" 80 echo " with given PCI device will be made upon reset" 81 exit 0 82} 83 84# In monolithic kernels the lsmod won't work. So 85# back that with a /sys/modules. We also check 86# /sys/bus/pci/drivers/ as neither lsmod nor /sys/modules might 87# contain needed info (like in Fedora-like OS). 88function check_for_driver() { 89 if [[ -z $1 ]]; then 90 return 0 91 fi 92 93 if lsmod | grep -q ${1//-/_}; then 94 return 1 95 fi 96 97 if [[ -d /sys/module/${1} || -d \ 98 /sys/module/${1//-/_} || -d \ 99 /sys/bus/pci/drivers/${1} || -d \ 100 /sys/bus/pci/drivers/${1//-/_} ]]; then 101 return 2 102 fi 103 return 0 104} 105 106function check_for_driver_freebsd() { 107 # Check if dpdk drivers (nic_uio, contigmem) are in the kernel's module path. 108 local search_paths path driver 109 IFS=";" read -ra search_paths < <(kldconfig -rU) 110 111 for driver in contigmem.ko nic_uio.ko; do 112 for path in "${search_paths[@]}"; do 113 [[ -f $path/$driver ]] && continue 2 114 done 115 return 1 116 done 117 return 0 118} 119 120function pci_dev_echo() { 121 local bdf="$1" 122 shift 123 echo "$bdf (${pci_ids_vendor["$bdf"]#0x} ${pci_ids_device["$bdf"]#0x}): $*" 124} 125 126function linux_bind_driver() { 127 bdf="$1" 128 driver_name="$2" 129 old_driver_name=${drivers_d["$bdf"]:-no driver} 130 ven_dev_id="${pci_ids_vendor["$bdf"]#0x} ${pci_ids_device["$bdf"]#0x}" 131 132 if [[ $driver_name == "$old_driver_name" ]]; then 133 pci_dev_echo "$bdf" "Already using the $old_driver_name driver" 134 return 0 135 fi 136 137 if [[ $old_driver_name != "no driver" ]]; then 138 echo "$ven_dev_id" > "/sys/bus/pci/devices/$bdf/driver/remove_id" 2> /dev/null || true 139 echo "$bdf" > "/sys/bus/pci/devices/$bdf/driver/unbind" 140 fi 141 142 pci_dev_echo "$bdf" "$old_driver_name -> $driver_name" 143 144 if [[ $driver_name == "none" ]]; then 145 return 0 146 fi 147 148 echo "$ven_dev_id" > "/sys/bus/pci/drivers/$driver_name/new_id" 2> /dev/null || true 149 echo "$bdf" > "/sys/bus/pci/drivers/$driver_name/bind" 2> /dev/null || true 150 151 if [[ $driver_name == uio_pci_generic ]] && ! check_for_driver igb_uio; then 152 # Check if the uio_pci_generic driver is broken as it might be in 153 # some 4.18.x kernels (see centos8 for instance) - if our device 154 # didn't get a proper uio entry, fallback to igb_uio 155 if [[ ! -e /sys/bus/pci/devices/$bdf/uio ]]; then 156 pci_dev_echo "$bdf" "uio_pci_generic potentially broken, moving to igb_uio" 157 drivers_d["$bdf"]="no driver" 158 # This call will override $driver_name for remaining devices as well 159 linux_bind_driver "$bdf" igb_uio 160 fi 161 fi 162 163 iommu_group=$(basename $(readlink -f /sys/bus/pci/devices/$bdf/iommu_group)) 164 if [ -e "/dev/vfio/$iommu_group" ]; then 165 if [ -n "$TARGET_USER" ]; then 166 chown "$TARGET_USER" "/dev/vfio/$iommu_group" 167 fi 168 fi 169} 170 171function linux_unbind_driver() { 172 local bdf="$1" 173 local ven_dev_id 174 ven_dev_id="${pci_ids_vendor["$bdf"]#0x} ${pci_ids_device["$bdf"]#0x}" 175 local old_driver_name=${drivers_d["$bdf"]:-no driver} 176 177 if [[ $old_driver_name == "no driver" ]]; then 178 pci_dev_echo "$bdf" "Not bound to any driver" 179 return 0 180 fi 181 182 if [[ -e /sys/bus/pci/drivers/$old_driver_name ]]; then 183 echo "$ven_dev_id" > "/sys/bus/pci/drivers/$old_driver_name/remove_id" 2> /dev/null || true 184 echo "$bdf" > "/sys/bus/pci/drivers/$old_driver_name/unbind" 185 fi 186 187 pci_dev_echo "$bdf" "$old_driver_name -> no driver" 188} 189 190function linux_hugetlbfs_mounts() { 191 mount | grep ' type hugetlbfs ' | awk '{ print $3 }' 192} 193 194function get_block_dev_from_bdf() { 195 local bdf=$1 196 local block 197 198 for block in /sys/block/*; do 199 if [[ $(readlink -f "$block/device") == *"/$bdf/"* ]]; then 200 echo "${block##*/}" 201 fi 202 done 203} 204 205function get_used_bdf_block_devs() { 206 local bdf=$1 207 local blocks block blockp dev mount holder 208 local used 209 210 hash lsblk &> /dev/null || return 1 211 blocks=($(get_block_dev_from_bdf "$bdf")) 212 213 for block in "${blocks[@]}"; do 214 # Check if the device is hold by some other, regardless if it's mounted 215 # or not. 216 for holder in "/sys/class/block/$block"*/holders/*; do 217 [[ -e $holder ]] || continue 218 blockp=${holder%/holders*} blockp=${blockp##*/} 219 if [[ -e $holder/slaves/$blockp ]]; then 220 used+=("holder@$blockp:${holder##*/}") 221 fi 222 done 223 while read -r dev mount; do 224 if [[ -e $mount ]]; then 225 used+=("mount@$block:$dev") 226 fi 227 done < <(lsblk -l -n -o NAME,MOUNTPOINT "/dev/$block") 228 if ((${#used[@]} == 0)); then 229 # Make sure we check if there's any valid data present on the target device 230 # regardless if it's being actively used or not. This is mainly done to make 231 # sure we don't miss more complex setups like ZFS pools, etc. 232 if block_in_use "$block" > /dev/null; then 233 used+=("data@$block") 234 fi 235 fi 236 done 237 238 if ((${#used[@]} > 0)); then 239 printf '%s\n' "${used[@]}" 240 fi 241} 242 243function collect_devices() { 244 # NVMe, IOAT, DSA, IAA, VIRTIO, VMD 245 246 local ids dev_type dev_id bdf bdfs in_use driver 247 248 ids+="PCI_DEVICE_ID_INTEL_IOAT" 249 ids+="|PCI_DEVICE_ID_INTEL_DSA" 250 ids+="|PCI_DEVICE_ID_INTEL_IAA" 251 ids+="|PCI_DEVICE_ID_VIRTIO" 252 ids+="|PCI_DEVICE_ID_INTEL_VMD" 253 ids+="|SPDK_PCI_CLASS_NVME" 254 255 local -gA nvme_d ioat_d dsa_d iaa_d virtio_d vmd_d all_devices_d drivers_d 256 257 while read -r _ dev_type dev_id; do 258 bdfs=(${pci_bus_cache["0x8086:$dev_id"]}) 259 [[ $dev_type == *NVME* ]] && bdfs=(${pci_bus_cache["$dev_id"]}) 260 [[ $dev_type == *VIRT* ]] && bdfs=(${pci_bus_cache["0x1af4:$dev_id"]}) 261 [[ $dev_type =~ (NVME|IOAT|DSA|IAA|VIRTIO|VMD) ]] && dev_type=${BASH_REMATCH[1],,} 262 for bdf in "${bdfs[@]}"; do 263 in_use=0 264 if [[ $1 != status ]]; then 265 if ! pci_can_use "$bdf"; then 266 pci_dev_echo "$bdf" "Skipping denied controller at $bdf" 267 in_use=1 268 fi 269 if [[ $dev_type == nvme || $dev_type == virtio ]]; then 270 if ! verify_bdf_block_devs "$bdf"; then 271 in_use=1 272 fi 273 fi 274 if [[ $dev_type == vmd ]]; then 275 if [[ $PCI_ALLOWED != *"$bdf"* ]]; then 276 pci_dev_echo "$bdf" "Skipping not allowed VMD controller at $bdf" 277 in_use=1 278 elif [[ " ${drivers_d[*]} " =~ "nvme" ]]; then 279 if [[ "${DRIVER_OVERRIDE}" != "none" ]]; then 280 if [ "$mode" == "config" ]; then 281 cat <<- MESSAGE 282 Binding new driver to VMD device. If there are NVMe SSDs behind the VMD endpoint 283 which are attached to the kernel NVMe driver,the binding process may go faster 284 if you first run this script with DRIVER_OVERRIDE="none" to unbind only the 285 NVMe SSDs, and then run again to unbind the VMD devices." 286 MESSAGE 287 fi 288 fi 289 fi 290 fi 291 fi 292 eval "${dev_type}_d[$bdf]=$in_use" 293 all_devices_d["$bdf"]=$in_use 294 if [[ -e /sys/bus/pci/devices/$bdf/driver ]]; then 295 driver=$(readlink -f "/sys/bus/pci/devices/$bdf/driver") 296 drivers_d["$bdf"]=${driver##*/} 297 fi 298 done 299 done < <(grep -E "$ids" "$rootdir/include/spdk/pci_ids.h") 300} 301 302function collect_driver() { 303 local bdf=$1 304 local drivers driver 305 306 if [[ -e /sys/bus/pci/devices/$bdf/modalias ]] \ 307 && drivers=($(modprobe -R "$(< "/sys/bus/pci/devices/$bdf/modalias")")); then 308 # Pick first entry in case multiple aliases are bound to a driver. 309 driver=$(readlink -f "/sys/module/${drivers[0]}/drivers/pci:"*) 310 driver=${driver##*/} 311 else 312 [[ -n ${nvme_d["$bdf"]} ]] && driver=nvme 313 [[ -n ${ioat_d["$bdf"]} ]] && driver=ioatdma 314 [[ -n ${dsa_d["$bdf"]} ]] && driver=dsa 315 [[ -n ${iaa_d["$bdf"]} ]] && driver=iaa 316 [[ -n ${virtio_d["$bdf"]} ]] && driver=virtio-pci 317 [[ -n ${vmd_d["$bdf"]} ]] && driver=vmd 318 fi 2> /dev/null 319 echo "$driver" 320} 321 322function verify_bdf_block_devs() { 323 local bdf=$1 324 local blknames 325 blknames=($(get_used_bdf_block_devs "$bdf")) || return 1 326 327 if ((${#blknames[@]} > 0)); then 328 local IFS="," 329 pci_dev_echo "$bdf" "Active devices: ${blknames[*]}, so not binding PCI dev" 330 return 1 331 fi 332} 333 334function configure_linux_pci() { 335 local driver_path="" 336 driver_name="" 337 igb_uio_fallback="" 338 339 if [[ -r "$rootdir/dpdk/build-tmp/kernel/linux/igb_uio/igb_uio.ko" ]]; then 340 # igb_uio is a common driver to override with and it depends on uio. 341 modprobe uio || true 342 if ! check_for_driver igb_uio || insmod "$rootdir/dpdk/build-tmp/kernel/linux/igb_uio/igb_uio.ko"; then 343 igb_uio_fallback="$rootdir/dpdk/build-tmp/kernel/linux/igb_uio/igb_uio.ko" 344 fi 345 fi 346 347 if [[ "${DRIVER_OVERRIDE}" == "none" ]]; then 348 driver_name=none 349 elif [[ -n "${DRIVER_OVERRIDE}" ]]; then 350 driver_path="$DRIVER_OVERRIDE" 351 driver_name="${DRIVER_OVERRIDE##*/}" 352 # modprobe and the sysfs don't use the .ko suffix. 353 driver_name=${driver_name%.ko} 354 # path = name -> there is no path 355 if [[ "$driver_path" = "$driver_name" ]]; then 356 driver_path="" 357 fi 358 elif [[ -n "$(ls /sys/kernel/iommu_groups)" || (-e \ 359 /sys/module/vfio/parameters/enable_unsafe_noiommu_mode && \ 360 "$(cat /sys/module/vfio/parameters/enable_unsafe_noiommu_mode)" == "Y") ]]; then 361 driver_name=vfio-pci 362 # Just in case, attempt to load VFIO_IOMMU_TYPE1 module into the kernel - this 363 # should be done automatically by modprobe since this particular module should 364 # be a part of vfio-pci dependencies, however, on some distros, it seems that 365 # it's not the case. See #1689. 366 if modinfo vfio_iommu_type1 > /dev/null; then 367 modprobe vfio_iommu_type1 368 fi 369 elif ! check_for_driver uio_pci_generic || modinfo uio_pci_generic > /dev/null 2>&1; then 370 driver_name=uio_pci_generic 371 elif [[ -e $igb_uio_fallback ]]; then 372 driver_path="$igb_uio_fallback" 373 driver_name="igb_uio" 374 echo "WARNING: uio_pci_generic not detected - using $driver_name" 375 else 376 echo "No valid drivers found [vfio-pci, uio_pci_generic, igb_uio]. Please enable one of the kernel modules." 377 return 1 378 fi 379 380 # modprobe assumes the directory of the module. If the user passes in a path, we should use insmod 381 if [[ $driver_name != "none" ]]; then 382 if [[ -n "$driver_path" ]]; then 383 insmod $driver_path || true 384 else 385 modprobe $driver_name 386 fi 387 fi 388 389 for bdf in "${!all_devices_d[@]}"; do 390 if ((all_devices_d["$bdf"] == 0)); then 391 if [[ -n ${nvme_d["$bdf"]} ]]; then 392 # Some nvme controllers may take significant amount of time while being 393 # unbound from the driver. Put that task into background to speed up the 394 # whole process. Currently this is done only for the devices bound to the 395 # nvme driver as other, i.e., ioatdma's, trigger a kernel BUG when being 396 # unbound in parallel. See https://bugzilla.kernel.org/show_bug.cgi?id=209041. 397 linux_bind_driver "$bdf" "$driver_name" & 398 else 399 linux_bind_driver "$bdf" "$driver_name" 400 fi 401 fi 402 done 403 wait 404 405 echo "1" > "/sys/bus/pci/rescan" 406} 407 408function cleanup_linux() { 409 shopt -s extglob nullglob 410 dirs_to_clean="" 411 dirs_to_clean="$(echo {/var/run,/tmp}/dpdk/spdk{,_pid}+([0-9])) " 412 if [[ -d $XDG_RUNTIME_DIR && $XDG_RUNTIME_DIR != *" "* ]]; then 413 dirs_to_clean+="$(readlink -e assert_not_empty $XDG_RUNTIME_DIR/dpdk/spdk{,_pid}+([0-9]) || true) " 414 fi 415 416 files_to_clean="" file_locks=() 417 for dir in $dirs_to_clean; do 418 files_to_clean+="$(echo $dir/*) " 419 done 420 file_locks+=(/var/tmp/spdk_pci_lock*) 421 shopt -u extglob nullglob 422 423 files_to_clean+="$(ls -1 /dev/shm/* \ 424 | grep -E '(spdk_tgt|iscsi|vhost|nvmf|rocksdb|bdevio|bdevperf|vhost_fuzz|nvme_fuzz|accel_perf|bdev_svc)_trace|spdk_iscsi_conns' || true) " 425 files_to_clean+=" ${file_locks[*]}" 426 files_to_clean="$(readlink -e assert_not_empty $files_to_clean || true)" 427 if [[ -z "$files_to_clean" ]]; then 428 echo "Clean" 429 return 0 430 fi 431 432 shopt -s extglob 433 for fd_dir in $(echo /proc/+([0-9])); do 434 opened_files+="$(readlink -e assert_not_empty $fd_dir/fd/* || true)" 435 done 436 shopt -u extglob 437 438 if [[ -z "$opened_files" ]]; then 439 echo "Can't get list of opened files!" 440 exit 1 441 fi 442 443 echo 'Cleaning' 444 for f in $files_to_clean; do 445 if ! echo "$opened_files" | grep -E -q "^$f\$"; then 446 echo "Removing: $f" 447 rm $f 448 else 449 echo "Still open: $f" 450 fi 451 done 452 453 for dir in $dirs_to_clean; do 454 if ! echo "$opened_files" | grep -E -q "^$dir\$"; then 455 echo "Removing: $dir" 456 rmdir $dir 457 else 458 echo "Still open: $dir" 459 fi 460 done 461 echo "Clean" 462 463 unset dirs_to_clean files_to_clean opened_files 464} 465 466check_hugepages_alloc() { 467 local hp_int=$1 468 local allocated_hugepages 469 470 echo $((NRHUGE < 0 ? 0 : NRHUGE)) > "$hp_int" 471 472 allocated_hugepages=$(< "$hp_int") 473 if ((allocated_hugepages < NRHUGE)); then 474 cat <<- ERROR 475 476 ## ERROR: requested $NRHUGE hugepages but $allocated_hugepages could be allocated ${2:+on node$2}. 477 ## Memory might be heavily fragmented. Please try flushing the system cache, or reboot the machine. 478 ERROR 479 return 1 480 fi 481} 482 483clear_hugepages() { echo 0 > /proc/sys/vm/nr_hugepages; } 484 485configure_linux_hugepages() { 486 local node system_nodes 487 local nodes_to_use nodes_hp 488 489 if [[ $CLEAR_HUGE == yes ]]; then 490 clear_hugepages 491 fi 492 493 if [[ $HUGE_EVEN_ALLOC == yes ]]; then 494 clear_hugepages 495 check_hugepages_alloc /proc/sys/vm/nr_hugepages 496 return 0 497 fi 498 499 for node in /sys/devices/system/node/node*; do 500 [[ -e $node ]] || continue 501 nodes[${node##*node}]=$node/hugepages/hugepages-${HUGEPGSZ}kB/nr_hugepages 502 done 503 504 if ((${#nodes[@]} == 0)); then 505 # No NUMA support? Fallback to common interface 506 check_hugepages_alloc /proc/sys/vm/nr_hugepages 507 return 0 508 fi 509 510 IFS="," read -ra nodes_to_use <<< "$HUGENODE" 511 if ((${#nodes_to_use[@]} == 0)); then 512 nodes_to_use[0]=0 513 fi 514 515 # Align indexes with node ids 516 for node in "${!nodes_to_use[@]}"; do 517 if [[ ${nodes_to_use[node]} =~ ^nodes_hp\[[0-9]+\]= ]]; then 518 eval "${nodes_to_use[node]}" 519 elif [[ ${nodes_to_use[node]} =~ ^[0-9]+$ ]]; then 520 nodes_hp[nodes_to_use[node]]=$NRHUGE 521 fi 522 done 523 524 for node in "${!nodes_hp[@]}"; do 525 if [[ -z ${nodes[node]} ]]; then 526 echo "Node $node doesn't exist, ignoring" >&2 527 continue 528 fi 529 NRHUGE=${nodes_hp[node]:-$NRHUGE} check_hugepages_alloc "${nodes[node]}" "$node" 530 done 531} 532 533function configure_linux() { 534 configure_linux_pci 535 hugetlbfs_mounts=$(linux_hugetlbfs_mounts) 536 537 if [ -z "$hugetlbfs_mounts" ]; then 538 hugetlbfs_mounts=/mnt/huge 539 echo "Mounting hugetlbfs at $hugetlbfs_mounts" 540 mkdir -p "$hugetlbfs_mounts" 541 mount -t hugetlbfs nodev "$hugetlbfs_mounts" 542 fi 543 544 configure_linux_hugepages 545 546 if [ "$driver_name" = "vfio-pci" ]; then 547 if [ -n "$TARGET_USER" ]; then 548 for mount in $hugetlbfs_mounts; do 549 chown "$TARGET_USER" "$mount" 550 chmod g+w "$mount" 551 done 552 553 MEMLOCK_AMNT=$(su "$TARGET_USER" -c "ulimit -l") 554 if [[ $MEMLOCK_AMNT != "unlimited" ]]; then 555 MEMLOCK_MB=$((MEMLOCK_AMNT / 1024)) 556 cat <<- MEMLOCK 557 "$TARGET_USER" user memlock limit: $MEMLOCK_MB MB 558 559 This is the maximum amount of memory you will be 560 able to use with DPDK and VFIO if run as user "$TARGET_USER". 561 To change this, please adjust limits.conf memlock limit for user "$TARGET_USER". 562 MEMLOCK 563 if ((MEMLOCK_AMNT < 65536)); then 564 echo "" 565 echo "## WARNING: memlock limit is less than 64MB" 566 echo -n "## DPDK with VFIO may not be able to initialize " 567 echo "if run as user \"$TARGET_USER\"." 568 fi 569 fi 570 fi 571 fi 572 573 if [ $(uname -i) == "x86_64" ] && [ ! -e /dev/cpu/0/msr ]; then 574 # Some distros build msr as a module. Make sure it's loaded to ensure 575 # DPDK can easily figure out the TSC rate rather than relying on 100ms 576 # sleeps. 577 modprobe msr &> /dev/null || true 578 fi 579} 580 581function reset_linux_pci() { 582 # virtio 583 # TODO: check if virtio-pci is loaded first and just unbind if it is not loaded 584 # Requires some more investigation - for example, some kernels do not seem to have 585 # virtio-pci but just virtio_scsi instead. Also need to make sure we get the 586 # underscore vs. dash right in the virtio_scsi name. 587 modprobe virtio-pci || true 588 for bdf in "${!all_devices_d[@]}"; do 589 ((all_devices_d["$bdf"] == 0)) || continue 590 591 driver=$(collect_driver "$bdf") 592 if [[ -n $driver ]] && ! check_for_driver "$driver"; then 593 linux_bind_driver "$bdf" "$driver" 594 else 595 linux_unbind_driver "$bdf" 596 fi 597 done 598 599 echo "1" > "/sys/bus/pci/rescan" 600} 601 602function reset_linux() { 603 reset_linux_pci 604 for mount in $(linux_hugetlbfs_mounts); do 605 for hp in "$mount"/spdk*map_*; do 606 flock -n "$hp" true && rm -f "$hp" 607 done 608 done 609 rm -f /run/.spdk* 610} 611 612function status_linux() { 613 echo "Hugepages" >&2 614 printf "%-6s %10s %8s / %6s\n" "node" "hugesize" "free" "total" >&2 615 616 numa_nodes=0 617 shopt -s nullglob 618 for path in /sys/devices/system/node/node*/hugepages/hugepages-*/; do 619 numa_nodes=$((numa_nodes + 1)) 620 free_pages=$(cat $path/free_hugepages) 621 all_pages=$(cat $path/nr_hugepages) 622 623 [[ $path =~ (node[0-9]+)/hugepages/hugepages-([0-9]+kB) ]] 624 625 node=${BASH_REMATCH[1]} 626 huge_size=${BASH_REMATCH[2]} 627 628 printf "%-6s %10s %8s / %6s\n" $node $huge_size $free_pages $all_pages 629 done 630 shopt -u nullglob 631 632 # fall back to system-wide hugepages 633 if [ "$numa_nodes" = "0" ]; then 634 free_pages=$(grep HugePages_Free /proc/meminfo | awk '{ print $2 }') 635 all_pages=$(grep HugePages_Total /proc/meminfo | awk '{ print $2 }') 636 node="-" 637 huge_size="$HUGEPGSZ" 638 639 printf "%-6s %10s %8s / %6s\n" $node $huge_size $free_pages $all_pages 640 fi 641 642 printf '\n%-8s %-15s %-6s %-6s %-7s %-16s %-10s %s\n' \ 643 "Type" "BDF" "Vendor" "Device" "NUMA" "Driver" "Device" "Block devices" >&2 644 645 sorted_bdfs=($(printf '%s\n' "${!all_devices_d[@]}" | sort)) 646 647 for bdf in "${sorted_bdfs[@]}"; do 648 driver=${drivers_d["$bdf"]} 649 if [ "$numa_nodes" = "0" ]; then 650 node="-" 651 else 652 node=$(cat /sys/bus/pci/devices/$bdf/numa_node) 653 if ((node == -1)); then 654 node=unknown 655 fi 656 fi 657 if [ "$driver" = "nvme" ] && [ -d /sys/bus/pci/devices/$bdf/nvme ]; then 658 name=$(ls /sys/bus/pci/devices/$bdf/nvme) 659 else 660 name="-" 661 fi 662 663 if [[ -n ${nvme_d["$bdf"]} || -n ${virtio_d["$bdf"]} ]]; then 664 blknames=($(get_block_dev_from_bdf "$bdf")) 665 else 666 blknames=("-") 667 fi 668 669 desc="" 670 desc=${desc:-${nvme_d["$bdf"]:+NVMe}} 671 desc=${desc:-${ioat_d["$bdf"]:+I/OAT}} 672 desc=${desc:-${dsa_d["$bdf"]:+DSA}} 673 desc=${desc:-${iaa_d["$bdf"]:+IAA}} 674 desc=${desc:-${virtio_d["$bdf"]:+virtio}} 675 desc=${desc:-${vmd_d["$bdf"]:+VMD}} 676 677 printf '%-8s %-15s %-6s %-6s %-7s %-16s %-10s %s\n' \ 678 "$desc" "$bdf" "${pci_ids_vendor["$bdf"]#0x}" "${pci_ids_device["$bdf"]#0x}" \ 679 "$node" "${driver:--}" "${name:-}" "${blknames[*]:--}" 680 done 681} 682 683function status_freebsd() { 684 local pci 685 686 status_print() ( 687 local type=$1 688 local dev driver 689 690 shift 691 692 for pci; do 693 printf '%-8s %-15s %-6s %-6s %-16s\n' \ 694 "$type" \ 695 "$pci" \ 696 "${pci_ids_vendor["$pci"]}" \ 697 "${pci_ids_device["$pci"]}" \ 698 "${pci_bus_driver["$pci"]}" 699 done | sort -k2,2 700 ) 701 702 local contigmem=present 703 local contigmem_buffer_size 704 local contigmem_num_buffers 705 706 if ! kldstat -q -m contigmem; then 707 contigmem="not present" 708 fi 709 if ! contigmem_buffer_size=$(kenv hw.contigmem.buffer_size 2> /dev/null); then 710 contigmem_buffer_size="not set" 711 fi 712 if ! contigmem_num_buffers=$(kenv hw.contigmem.num_buffers 2> /dev/null); then 713 contigmem_num_buffers="not set" 714 fi 715 716 cat <<- BSD_INFO 717 Contigmem ($contigmem) 718 Buffer Size: $contigmem_buffer_size 719 Num Buffers: $contigmem_num_buffers 720 721 BSD_INFO 722 723 printf '\n%-8s %-15s %-6s %-6s %-16s\n' \ 724 "Type" "BDF" "Vendor" "Device" "Driver" >&2 725 726 status_print "NVMe" "${!nvme_d[@]}" 727 status_print "I/OAT" "${!ioat_d[@]}" 728 status_print "DSA" "${!dsa_d[@]}" 729 status_print "IAA" "${!iaa_d[@]}" 730 status_print "VMD" "${!vmd_d[@]}" 731} 732 733function configure_freebsd_pci() { 734 local BDFS 735 736 BDFS+=("${!nvme_d[@]}") 737 BDFS+=("${!ioat_d[@]}") 738 BDFS+=("${!dsa_d[@]}") 739 BDFS+=("${!iaa_d[@]}") 740 BDFS+=("${!vmd_d[@]}") 741 742 # Drop the domain part from all the addresses 743 BDFS=("${BDFS[@]#*:}") 744 745 local IFS="," 746 kldunload nic_uio.ko || true 747 kenv hw.nic_uio.bdfs="${BDFS[*]}" 748 kldload nic_uio.ko 749} 750 751function configure_freebsd() { 752 if ! check_for_driver_freebsd; then 753 echo "DPDK drivers (contigmem and/or nic_uio) are missing, aborting" >&2 754 return 1 755 fi 756 configure_freebsd_pci 757 # If contigmem is already loaded but the HUGEMEM specified doesn't match the 758 # previous value, unload contigmem so that we can reload with the new value. 759 if kldstat -q -m contigmem; then 760 # contigmem may be loaded, but the kernel environment doesn't have to 761 # be necessarily set at this point. If it isn't, kenv will fail to 762 # pick up the hw. options. Handle it. 763 if ! contigmem_num_buffers=$(kenv hw.contigmem.num_buffers); then 764 contigmem_num_buffers=-1 765 fi 2> /dev/null 766 if ((contigmem_num_buffers != HUGEMEM / 256)); then 767 kldunload contigmem.ko 768 fi 769 fi 770 if ! kldstat -q -m contigmem; then 771 kenv hw.contigmem.num_buffers=$((HUGEMEM / 256)) 772 kenv hw.contigmem.buffer_size=$((256 * 1024 * 1024)) 773 kldload contigmem.ko 774 fi 775} 776 777function reset_freebsd() { 778 kldunload contigmem.ko || true 779 kldunload nic_uio.ko || true 780} 781 782CMD=reset cache_pci_bus 783 784mode=$1 785 786if [ -z "$mode" ]; then 787 mode="config" 788fi 789 790: ${HUGEMEM:=2048} 791: ${PCI_ALLOWED:=""} 792: ${PCI_BLOCKED:=""} 793 794if [ -n "$NVME_ALLOWED" ]; then 795 PCI_ALLOWED="$PCI_ALLOWED $NVME_ALLOWED" 796fi 797 798if [ -n "$SKIP_PCI" ]; then 799 PCI_ALLOWED="none" 800fi 801 802if [ -z "$TARGET_USER" ]; then 803 TARGET_USER="$SUDO_USER" 804 if [ -z "$TARGET_USER" ]; then 805 TARGET_USER=$(logname 2> /dev/null) || true 806 fi 807fi 808 809collect_devices "$mode" 810 811if [[ $mode == reset && $PCI_BLOCK_SYNC_ON_RESET == yes ]]; then 812 # Note that this will wait only for the first block device attached to 813 # a given storage controller. For nvme this may miss some of the devs 814 # in case multiple namespaces are being in place. 815 # FIXME: Wait for nvme controller(s) to be in live state and determine 816 # number of configured namespaces, build list of potential block devs 817 # and pass them to sync_dev_uevents. Is it worth the effort? 818 bdfs_to_wait_for=() 819 for bdf in "${!all_devices_d[@]}"; do 820 ((all_devices_d["$bdf"] == 0)) || continue 821 if [[ -n ${nvme_d["$bdf"]} || -n ${virtio_d["$bdf"]} ]]; then 822 [[ $(collect_driver "$bdf") != "${drivers_d["$bdf"]}" ]] || continue 823 bdfs_to_wait_for+=("$bdf") 824 fi 825 done 826 if ((${#bdfs_to_wait_for[@]} > 0)); then 827 echo "Waiting for block devices as requested" 828 export UEVENT_TIMEOUT=5 DEVPATH_LOOKUP=yes DEVPATH_SUBSYSTEM=pci 829 "$rootdir/scripts/sync_dev_uevents.sh" \ 830 block/disk \ 831 "${bdfs_to_wait_for[@]}" & 832 sync_pid=$! 833 fi 834fi 835 836if [[ $os == Linux ]]; then 837 if [[ -n $HUGEPGSZ && ! -e /sys/kernel/mm/hugepages/hugepages-${HUGEPGSZ}kB ]]; then 838 echo "${HUGEPGSZ}kB is not supported by the running kernel, ignoring" >&2 839 unset -v HUGEPGSZ 840 fi 841 842 HUGEPGSZ=${HUGEPGSZ:-$(grep Hugepagesize /proc/meminfo | cut -d : -f 2 | tr -dc '0-9')} 843 HUGEPGSZ_MB=$((HUGEPGSZ / 1024)) 844 : ${NRHUGE=$(((HUGEMEM + HUGEPGSZ_MB - 1) / HUGEPGSZ_MB))} 845 846 if [ "$mode" == "config" ]; then 847 configure_linux 848 elif [ "$mode" == "cleanup" ]; then 849 cleanup_linux 850 clear_hugepages 851 elif [ "$mode" == "reset" ]; then 852 reset_linux 853 elif [ "$mode" == "status" ]; then 854 status_linux 855 elif [ "$mode" == "help" ]; then 856 usage $0 857 else 858 usage $0 "Invalid argument '$mode'" 859 fi 860else 861 if [ "$mode" == "config" ]; then 862 configure_freebsd 863 elif [ "$mode" == "reset" ]; then 864 reset_freebsd 865 elif [ "$mode" == "cleanup" ]; then 866 echo "setup.sh cleanup function not yet supported on $os" 867 elif [ "$mode" == "status" ]; then 868 status_freebsd 869 elif [ "$mode" == "help" ]; then 870 usage $0 871 else 872 usage $0 "Invalid argument '$mode'" 873 fi 874fi 875 876if [[ -e /proc/$sync_pid/status ]]; then 877 wait "$sync_pid" 878fi 879