#!/usr/bin/env bash
# Helper script for allocating hugepages and binding NVMe, I/OAT, IDXD, VMD and
# Virtio devices to a generic VFIO/UIO kernel driver (Linux) or nic_uio (FreeBSD).
# See usage() below for modes and the environment variables that control behavior.

set -e

os=$(uname -s)

if [[ $os != Linux && $os != FreeBSD ]]; then
	echo "Not supported platform ($os), aborting"
	exit 1
fi

rootdir=$(readlink -f "$(dirname "$0")")/..
source "$rootdir/scripts/common.sh"

# Print help text. $1 is the script path (used in the usage line), optional $2
# is an error message printed first. Always exits 0.
function usage() {
	if [[ $os == Linux ]]; then
		options="[config|reset|status|cleanup|help]"
	else
		options="[config|reset|help]"
	fi

	[[ -n $2 ]] && (
		echo "$2"
		echo ""
	)
	echo "Helper script for allocating hugepages and binding NVMe, I/OAT, VMD and Virtio devices"
	echo "to a generic VFIO kernel driver. If VFIO is not available on the system, this script"
	echo "will fall back to UIO. NVMe and Virtio devices with active mountpoints will be ignored."
	echo "All hugepage operations use default hugepage size on the system (hugepagesz)."
	echo "Usage: $(basename "$1") $options"
	echo
	echo "$options - as following:"
	echo "config            Default mode. Allocate hugepages and bind PCI devices."
	if [[ $os == Linux ]]; then
		echo "cleanup           Remove any orphaned files that can be left in the system after SPDK application exit"
	fi
	echo "reset             Rebind PCI devices back to their original drivers."
	echo "                  Also cleanup any leftover spdk files/resources."
	echo "                  Hugepage memory size will remain unchanged."
	if [[ $os == Linux ]]; then
		echo "status            Print status of all SPDK-compatible devices on the system."
	fi
	echo "help              Print this help message."
	echo
	echo "The following environment variables can be specified."
	echo "HUGEMEM           Size of hugepage memory to allocate (in MB). 2048 by default."
	echo "                  For NUMA systems, the hugepages will be distributed on node0 by"
	echo "                  default."
	echo "HUGE_EVEN_ALLOC   If set to 'yes', hugepages will be evenly distributed across all"
	echo "                  system's NUMA nodes (effectively ignoring anything set in HUGENODE)."
	echo "                  Uses kernel's default for hugepages size."
	echo "NRHUGE            Number of hugepages to allocate. This variable overwrites HUGEMEM."
	echo "HUGENODE          Specific NUMA node to allocate hugepages on. Multiple nodes can be"
	echo "                  separated with commas. By default, NRHUGE will be applied on each node."
	echo "                  Hugepages can be defined per node with e.g.:"
	echo "                  HUGENODE='nodes_hp[0]=2048,nodes_hp[1]=512,2' - this will allocate"
	echo "                  2048 pages for node0, 512 for node1 and default NRHUGE for node2."
	echo "HUGEPGSZ          Size of the hugepages to use in kB. If not set, kernel's default"
	echo "                  setting is used."
	echo "CLEAR_HUGE        If set to 'yes', the attempt to remove hugepages from all nodes will"
	echo "                  be made prior to allocation."
	echo "PCI_ALLOWED"
	echo "PCI_BLOCKED       Whitespace separated list of PCI devices (NVMe, I/OAT, VMD, Virtio)."
	echo "                  Each device must be specified as a full PCI address."
	echo "                  E.g. PCI_ALLOWED=\"0000:01:00.0 0000:02:00.0\""
	echo "                  To block all PCI devices use a non-valid address."
	echo "                  E.g. PCI_BLOCKED=\"none\""
	echo "                  If PCI_ALLOWED and PCI_BLOCKED are empty or unset, all PCI devices"
	echo "                  will be bound."
	echo "                  Each device in PCI_BLOCKED will be ignored (driver won't be changed)."
	echo "                  PCI_BLOCKED has precedence over PCI_ALLOWED."
	echo "TARGET_USER       User that will own hugepage mountpoint directory and vfio groups."
	echo "                  By default the current user will be used."
	echo "DRIVER_OVERRIDE   Disable automatic vfio-pci/uio_pci_generic selection and forcefully"
	echo "                  bind devices to the given driver."
	echo "                  E.g. DRIVER_OVERRIDE=uio_pci_generic or DRIVER_OVERRIDE=/home/public/dpdk/build/kmod/igb_uio.ko"
	echo "PCI_BLOCK_SYNC_ON_RESET"
	echo "                  If set in the environment, the attempt to wait for block devices associated"
	echo "                  with given PCI device will be made upon reset"
	exit 0
}

# In monolithic kernels the lsmod won't work. So
# back that with a /sys/modules. We also check
# /sys/bus/pci/drivers/ as neither lsmod nor /sys/modules might
# contain needed info (like in Fedora-like OS).
# Returns: 0 - driver not found, 1 - listed by lsmod, 2 - present in sysfs.
function check_for_driver() {
	if lsmod | grep -q "${1//-/_}"; then
		return 1
	fi

	if [[ -d /sys/module/${1} || -d \
		/sys/module/${1//-/_} || -d \
		/sys/bus/pci/drivers/${1} || -d \
		/sys/bus/pci/drivers/${1//-/_} ]]; then
		return 2
	fi
	return 0
}

function check_for_driver_freebsd() {
	# Check if dpdk drivers (nic_uio, contigmem) are in the kernel's module path.
	local search_paths path driver
	IFS=";" read -ra search_paths < <(kldconfig -rU)

	for driver in contigmem.ko nic_uio.ko; do
		for path in "${search_paths[@]}"; do
			[[ -f $path/$driver ]] && continue 2
		done
		return 1
	done
	return 0
}

# Echo a message prefixed with the device's BDF and vendor:device ids.
function pci_dev_echo() {
	local bdf="$1"
	shift
	echo "$bdf (${pci_ids_vendor["$bdf"]#0x} ${pci_ids_device["$bdf"]#0x}): $*"
}

# Bind device $1 to driver $2 via sysfs (unbinding from the current driver
# first) and hand the resulting vfio group to $TARGET_USER if applicable.
function linux_bind_driver() {
	bdf="$1"
	driver_name="$2"
	old_driver_name=${drivers_d["$bdf"]:-no driver}
	ven_dev_id="${pci_ids_vendor["$bdf"]#0x} ${pci_ids_device["$bdf"]#0x}"

	if [[ $driver_name == "$old_driver_name" ]]; then
		pci_dev_echo "$bdf" "Already using the $old_driver_name driver"
		return 0
	fi

	if [[ $old_driver_name != "no driver" ]]; then
		echo "$ven_dev_id" > "/sys/bus/pci/devices/$bdf/driver/remove_id" 2> /dev/null || true
		echo "$bdf" > "/sys/bus/pci/devices/$bdf/driver/unbind"
	fi

	pci_dev_echo "$bdf" "$old_driver_name -> $driver_name"

	echo "$ven_dev_id" > "/sys/bus/pci/drivers/$driver_name/new_id" 2> /dev/null || true
	echo "$bdf" > "/sys/bus/pci/drivers/$driver_name/bind" 2> /dev/null || true

	if [[ $driver_name == uio_pci_generic && -e /sys/module/igb_uio ]]; then
		# Check if the uio_pci_generic driver is broken as it might be in
		# some 4.18.x kernels (see centos8 for instance) - if our device
		# didn't get a proper uio entry, fallback to igb_uio
		if [[ ! -e /sys/bus/pci/devices/$bdf/uio ]]; then
			pci_dev_echo "$bdf" "uio_pci_generic potentially broken, moving to igb_uio"
			drivers_d["$bdf"]="no driver"
			# This call will override $driver_name for remaining devices as well
			linux_bind_driver "$bdf" igb_uio
		fi
	fi

	iommu_group=$(basename "$(readlink -f "/sys/bus/pci/devices/$bdf/iommu_group")")
	if [ -e "/dev/vfio/$iommu_group" ]; then
		if [ -n "$TARGET_USER" ]; then
			chown "$TARGET_USER" "/dev/vfio/$iommu_group"
		fi
	fi
}

# Unbind device $1 from its current driver (as recorded in drivers_d).
function linux_unbind_driver() {
	local bdf="$1"
	local ven_dev_id
	ven_dev_id="${pci_ids_vendor["$bdf"]#0x} ${pci_ids_device["$bdf"]#0x}"
	local old_driver_name=${drivers_d["$bdf"]:-no driver}

	if [[ -e /sys/bus/pci/drivers/$old_driver_name ]]; then
		echo "$ven_dev_id" > "/sys/bus/pci/drivers/$old_driver_name/remove_id" 2> /dev/null || true
		echo "$bdf" > "/sys/bus/pci/drivers/$old_driver_name/unbind"
	fi

	pci_dev_echo "$bdf" "$old_driver_name -> no driver"
}

# List mountpoints of all hugetlbfs mounts, one per line.
function linux_hugetlbfs_mounts() {
	mount | grep ' type hugetlbfs ' | awk '{ print $3 }'
}

# Print names of block devices (e.g. nvme0n1) that belong to PCI device $1.
function get_block_dev_from_bdf() {
	local bdf=$1
	local block

	for block in /sys/block/*; do
		if [[ $(readlink -f "$block/device") == *"/$bdf/"* ]]; then
			echo "${block##*/}"
		fi
	done
}

# Print "block:partition" pairs for partitions of $1's block devices that have
# an active mountpoint. Requires lsblk.
function get_mounted_part_dev_from_bdf_block() {
	local bdf=$1
	local blocks block dev mount

	hash lsblk || return 1
	blocks=($(get_block_dev_from_bdf "$bdf"))

	for block in "${blocks[@]}"; do
		while read -r dev mount; do
			if [[ -e $mount ]]; then
				echo "$block:$dev"
			fi
		done < <(lsblk -l -n -o NAME,MOUNTPOINT "/dev/$block")
	done
}

# Populate the global per-type device maps (nvme_d, ioat_d, idxd_d, virtio_d,
# vmd_d), all_devices_d (bdf -> 0/1 "in use" flag) and drivers_d (bdf ->
# current driver) from pci_ids.h and the cached PCI bus. $1 is the mode;
# anything other than "status" applies allow/block-list and mountpoint checks.
function collect_devices() {
	# NVMe, IOAT, IDXD, VIRTIO, VMD

	local ids dev_type dev_id bdf bdfs in_use driver

	ids+="PCI_DEVICE_ID_INTEL_IOAT"
	ids+="|PCI_DEVICE_ID_INTEL_IDXD"
	ids+="|PCI_DEVICE_ID_VIRTIO"
	ids+="|PCI_DEVICE_ID_INTEL_VMD"
	ids+="|SPDK_PCI_CLASS_NVME"

	local -gA nvme_d ioat_d idxd_d virtio_d vmd_d all_devices_d drivers_d

	while read -r _ dev_type dev_id; do
		bdfs=(${pci_bus_cache["0x8086:$dev_id"]})
		[[ $dev_type == *NVME* ]] && bdfs=(${pci_bus_cache["$dev_id"]})
		[[ $dev_type == *VIRT* ]] && bdfs=(${pci_bus_cache["0x1af4:$dev_id"]})
		[[ $dev_type =~ (NVME|IOAT|IDXD|VIRTIO|VMD) ]] && dev_type=${BASH_REMATCH[1],,}
		for bdf in "${bdfs[@]}"; do
			in_use=0
			if [[ $1 != status ]]; then
				if ! pci_can_use "$bdf"; then
					pci_dev_echo "$bdf" "Skipping denied controller at $bdf"
					in_use=1
				fi
				if [[ $dev_type == nvme || $dev_type == virtio ]]; then
					if ! verify_bdf_mounts "$bdf"; then
						in_use=1
					fi
				fi
				if [[ $dev_type == vmd ]]; then
					if [[ $PCI_ALLOWED != *"$bdf"* ]]; then
						pci_dev_echo "$bdf" "Skipping not allowed VMD controller at $bdf"
						in_use=1
					fi
				fi
			fi
			eval "${dev_type}_d[$bdf]=$in_use"
			all_devices_d["$bdf"]=$in_use
			if [[ -e /sys/bus/pci/devices/$bdf/driver ]]; then
				driver=$(readlink -f "/sys/bus/pci/devices/$bdf/driver")
				drivers_d["$bdf"]=${driver##*/}
			fi
		done
	done < <(grep -E "$ids" "$rootdir/include/spdk/pci_ids.h")
}

# Print the name of the kernel driver that would normally handle device $1,
# resolved via modprobe from the device's modalias, with per-type fallbacks.
function collect_driver() {
	local bdf=$1
	local drivers driver

	[[ -e /sys/bus/pci/devices/$bdf/modalias ]] || return 1
	if drivers=($(modprobe -R "$(< "/sys/bus/pci/devices/$bdf/modalias")")); then
		# Pick first entry in case multiple aliases are bound to a driver.
		driver=$(readlink -f "/sys/module/${drivers[0]}/drivers/pci:"*)
		driver=${driver##*/}
	else
		[[ -n ${nvme_d["$bdf"]} ]] && driver=nvme
		[[ -n ${ioat_d["$bdf"]} ]] && driver=ioatdma
		[[ -n ${idxd_d["$bdf"]} ]] && driver=idxd
		[[ -n ${virtio_d["$bdf"]} ]] && driver=virtio-pci
		[[ -n ${vmd_d["$bdf"]} ]] && driver=vmd
	fi 2> /dev/null
	echo "$driver"
}

# Fail (return 1) if any block device behind PCI device $1 has an active
# mountpoint; used to avoid unbinding disks that are in use.
function verify_bdf_mounts() {
	local bdf=$1
	local blknames
	blknames=($(get_mounted_part_dev_from_bdf_block "$bdf")) || return 1

	if ((${#blknames[@]} > 0)); then
		local IFS=","
		pci_dev_echo "$bdf" "Active mountpoints on ${blknames[*]}, so not binding PCI dev"
		return 1
	fi
}

# Select a userspace driver (DRIVER_OVERRIDE > vfio-pci > uio_pci_generic >
# igb_uio), load it and bind all collected, not-in-use devices to it.
function configure_linux_pci() {
	local driver_path=""
	driver_name=""
	igb_uio_fallback=""

	# igb_uio is a common driver to override with and it depends on uio.
	modprobe uio
	if [[ -r "$rootdir/dpdk/build-tmp/kernel/linux/igb_uio/igb_uio.ko" ]]; then
		igb_uio_fallback=$rootdir/dpdk/build-tmp/kernel/linux/igb_uio/igb_uio.ko
		insmod "$igb_uio_fallback" || true
	fi

	if [[ -n "${DRIVER_OVERRIDE}" ]]; then
		driver_path="$DRIVER_OVERRIDE"
		driver_name="${DRIVER_OVERRIDE##*/}"
		# modprobe and the sysfs don't use the .ko suffix.
		driver_name=${driver_name%.ko}
		# path = name -> there is no path
		if [[ "$driver_path" = "$driver_name" ]]; then
			driver_path=""
		fi
	elif [[ -n "$(ls /sys/kernel/iommu_groups)" || (-e \
		/sys/module/vfio/parameters/enable_unsafe_noiommu_mode && \
		"$(cat /sys/module/vfio/parameters/enable_unsafe_noiommu_mode)" == "Y") ]]; then
		driver_name=vfio-pci
	elif modinfo uio_pci_generic > /dev/null 2>&1; then
		driver_name=uio_pci_generic
	elif [[ -e $igb_uio_fallback ]]; then
		driver_name="igb_uio"
		echo "WARNING: uio_pci_generic not detected - using $driver_name"
	else
		echo "No valid drivers found [vfio-pci, uio_pci_generic, igb_uio]. Please either enable the vfio-pci or uio_pci_generic"
		echo "kernel modules, or have SPDK build the igb_uio driver by running ./configure --with-igb-uio-driver and recompiling."
		return 1
	fi

	# modprobe assumes the directory of the module. If the user passes in a path, we should use insmod
	if [[ -n "$driver_path" ]]; then
		insmod "$driver_path" || true
	else
		modprobe "$driver_name"
	fi

	for bdf in "${!all_devices_d[@]}"; do
		if ((all_devices_d["$bdf"] == 0)); then
			if [[ -n ${nvme_d["$bdf"]} ]]; then
				# Some nvme controllers may take significant amount of time while being
				# unbound from the driver. Put that task into background to speed up the
				# whole process. Currently this is done only for the devices bound to the
				# nvme driver as other, i.e., ioatdma's, trigger a kernel BUG when being
				# unbound in parallel. See https://bugzilla.kernel.org/show_bug.cgi?id=209041.
				linux_bind_driver "$bdf" "$driver_name" &
			else
				linux_bind_driver "$bdf" "$driver_name"
			fi
		fi
	done
	wait

	echo "1" > "/sys/bus/pci/rescan"
}

# Remove SPDK runtime leftovers (dpdk dirs, /dev/shm trace files) that are not
# currently held open by any process.
function cleanup_linux() {
	shopt -s extglob nullglob
	dirs_to_clean=""
	dirs_to_clean="$(echo {/var/run,/tmp}/dpdk/spdk{,_pid}+([0-9])) "
	if [[ -d $XDG_RUNTIME_DIR && $XDG_RUNTIME_DIR != *" "* ]]; then
		dirs_to_clean+="$(readlink -e assert_not_empty $XDG_RUNTIME_DIR/dpdk/spdk{,_pid}+([0-9]) || true) "
	fi

	files_to_clean=""
	for dir in $dirs_to_clean; do
		files_to_clean+="$(echo $dir/*) "
	done
	shopt -u extglob nullglob

	files_to_clean+="$(ls -1 /dev/shm/* \
		| grep -E '(spdk_tgt|iscsi|vhost|nvmf|rocksdb|bdevio|bdevperf|vhost_fuzz|nvme_fuzz)_trace|spdk_iscsi_conns' || true) "
	files_to_clean="$(readlink -e assert_not_empty $files_to_clean || true)"
	if [[ -z "$files_to_clean" ]]; then
		echo "Clean"
		return 0
	fi

	shopt -s extglob
	for fd_dir in $(echo /proc/+([0-9])); do
		opened_files+="$(readlink -e assert_not_empty $fd_dir/fd/* || true)"
	done
	shopt -u extglob

	if [[ -z "$opened_files" ]]; then
		echo "Can't get list of opened files!"
		exit 1
	fi

	echo 'Cleaning'
	for f in $files_to_clean; do
		if ! echo "$opened_files" | grep -E -q "^$f\$"; then
			echo "Removing: $f"
			rm $f
		else
			echo "Still open: $f"
		fi
	done

	for dir in $dirs_to_clean; do
		if ! echo "$opened_files" | grep -E -q "^$dir\$"; then
			echo "Removing: $dir"
			rmdir $dir
		else
			echo "Still open: $dir"
		fi
	done
	echo "Clean"

	unset dirs_to_clean files_to_clean opened_files
}

# Request $NRHUGE hugepages through the sysfs/procfs interface $1 and verify
# the kernel actually granted them. Optional $2 is the NUMA node id (for the
# error message only).
check_hugepages_alloc() {
	local hp_int=$1
	local allocated_hugepages

	echo $((NRHUGE < 0 ? 0 : NRHUGE)) > "$hp_int"

	allocated_hugepages=$(< "$hp_int")
	if ((allocated_hugepages < NRHUGE)); then
		cat <<- ERROR

			## ERROR: requested $NRHUGE hugepages but $allocated_hugepages could be allocated ${2:+on node$2}.
			## Memory might be heavily fragmented. Please try flushing the system cache, or reboot the machine.
		ERROR
		return 1
	fi
}

clear_hugepages() { echo 0 > /proc/sys/vm/nr_hugepages; }

# Allocate hugepages either evenly across all nodes (HUGE_EVEN_ALLOC=yes) or
# per-node according to HUGENODE/NRHUGE.
configure_linux_hugepages() {
	local node system_nodes
	local nodes_to_use nodes_hp

	if [[ $CLEAR_HUGE == yes ]]; then
		clear_hugepages
	fi

	if [[ $HUGE_EVEN_ALLOC == yes ]]; then
		clear_hugepages
		check_hugepages_alloc /proc/sys/vm/nr_hugepages
		return 0
	fi

	for node in /sys/devices/system/node/node*; do
		[[ -e $node ]] || continue
		nodes[${node##*node}]=$node/hugepages/hugepages-${HUGEPGSZ}kB/nr_hugepages
	done

	IFS="," read -ra nodes_to_use <<< "$HUGENODE"
	if ((${#nodes_to_use[@]} == 0)); then
		nodes_to_use[0]=0
	fi

	# Align indexes with node ids
	for node in "${!nodes_to_use[@]}"; do
		if [[ ${nodes_to_use[node]} =~ ^nodes_hp\[[0-9]+\]= ]]; then
			eval "${nodes_to_use[node]}"
		elif [[ ${nodes_to_use[node]} =~ ^[0-9]+$ ]]; then
			nodes_hp[nodes_to_use[node]]=$NRHUGE
		fi
	done

	for node in "${!nodes_hp[@]}"; do
		if [[ -z ${nodes[node]} ]]; then
			echo "Node $node doesn't exist, ignoring" >&2
			continue
		fi
		NRHUGE=${nodes_hp[node]:-$NRHUGE} check_hugepages_alloc "${nodes[node]}" "$node"
	done
}

# Full Linux config: bind PCI devices, ensure a hugetlbfs mount exists,
# allocate hugepages, fix up permissions/memlock for TARGET_USER and load msr.
function configure_linux() {
	configure_linux_pci
	hugetlbfs_mounts=$(linux_hugetlbfs_mounts)

	if [ -z "$hugetlbfs_mounts" ]; then
		hugetlbfs_mounts=/mnt/huge
		echo "Mounting hugetlbfs at $hugetlbfs_mounts"
		mkdir -p "$hugetlbfs_mounts"
		mount -t hugetlbfs nodev "$hugetlbfs_mounts"
	fi

	configure_linux_hugepages

	if [ "$driver_name" = "vfio-pci" ]; then
		if [ -n "$TARGET_USER" ]; then
			for mount in $hugetlbfs_mounts; do
				chown "$TARGET_USER" "$mount"
				chmod g+w "$mount"
			done

			MEMLOCK_AMNT=$(su "$TARGET_USER" -c "ulimit -l")
			if [[ $MEMLOCK_AMNT != "unlimited" ]]; then
				MEMLOCK_MB=$((MEMLOCK_AMNT / 1024))
				cat <<- MEMLOCK
					"$TARGET_USER" user memlock limit: $MEMLOCK_MB MB

					This is the maximum amount of memory you will be
					able to use with DPDK and VFIO if run as user "$TARGET_USER".
					To change this, please adjust limits.conf memlock limit for user "$TARGET_USER".
				MEMLOCK
				if ((MEMLOCK_AMNT < 65536)); then
					echo ""
					echo "## WARNING: memlock limit is less than 64MB"
					echo -n "## DPDK with VFIO may not be able to initialize "
					echo "if run as user \"$TARGET_USER\"."
				fi
			fi
		fi
	fi

	if [ ! -f /dev/cpu/0/msr ]; then
		# Some distros build msr as a module. Make sure it's loaded to ensure
		# DPDK can easily figure out the TSC rate rather than relying on 100ms
		# sleeps.
		modprobe msr || true
	fi
}

# Rebind all collected, not-in-use devices back to their native kernel drivers
# (or leave them unbound if no such driver is loaded).
function reset_linux_pci() {
	# virtio
	# TODO: check if virtio-pci is loaded first and just unbind if it is not loaded
	# Requires some more investigation - for example, some kernels do not seem to have
	# virtio-pci but just virtio_scsi instead. Also need to make sure we get the
	# underscore vs. dash right in the virtio_scsi name.
	modprobe virtio-pci || true
	for bdf in "${!all_devices_d[@]}"; do
		((all_devices_d["$bdf"] == 0)) || continue

		driver=$(collect_driver "$bdf")
		if ! check_for_driver "$driver"; then
			linux_bind_driver "$bdf" "$driver"
		else
			linux_unbind_driver "$bdf"
		fi
	done

	echo "1" > "/sys/bus/pci/rescan"
}

# Reset PCI bindings and remove unlocked SPDK hugepage map files.
function reset_linux() {
	reset_linux_pci
	for mount in $(linux_hugetlbfs_mounts); do
		for hp in "$mount"/spdk*map_*; do
			flock -n "$hp" true && rm -f "$hp"
		done
	done
	rm -f /run/.spdk*
}

# Print hugepage usage per node and a table of all SPDK-compatible devices.
function status_linux() {
	echo "Hugepages" >&2
	printf "%-6s %10s %8s / %6s\n" "node" "hugesize" "free" "total" >&2

	numa_nodes=0
	shopt -s nullglob
	for path in /sys/devices/system/node/node*/hugepages/hugepages-*/; do
		numa_nodes=$((numa_nodes + 1))
		free_pages=$(cat "$path/free_hugepages")
		all_pages=$(cat "$path/nr_hugepages")

		[[ $path =~ (node[0-9]+)/hugepages/hugepages-([0-9]+kB) ]]

		node=${BASH_REMATCH[1]}
		huge_size=${BASH_REMATCH[2]}

		printf "%-6s %10s %8s / %6s\n" "$node" "$huge_size" "$free_pages" "$all_pages"
	done
	shopt -u nullglob

	# fall back to system-wide hugepages
	if [ "$numa_nodes" = "0" ]; then
		free_pages=$(grep HugePages_Free /proc/meminfo | awk '{ print $2 }')
		all_pages=$(grep HugePages_Total /proc/meminfo | awk '{ print $2 }')
		node="-"
		huge_size="$HUGEPGSZ"

		printf "%-6s %10s %8s / %6s\n" "$node" "$huge_size" "$free_pages" "$all_pages"
	fi

	printf '\n%-8s %-15s %-6s %-6s %-7s %-16s %-10s %s\n' \
		"Type" "BDF" "Vendor" "Device" "NUMA" "Driver" "Device" "Block devices" >&2

	for bdf in "${!all_devices_d[@]}"; do
		driver=${drivers_d["$bdf"]}
		if [ "$numa_nodes" = "0" ]; then
			node="-"
		else
			node=$(cat /sys/bus/pci/devices/$bdf/numa_node)
			if ((node == -1)); then
				node=unknown
			fi
		fi
		if [ "$driver" = "nvme" ] && [ -d /sys/bus/pci/devices/$bdf/nvme ]; then
			name=$(ls /sys/bus/pci/devices/$bdf/nvme)
		else
			name="-"
		fi

		if [[ -n ${nvme_d["$bdf"]} || -n ${virtio_d["$bdf"]} ]]; then
			blknames=($(get_block_dev_from_bdf "$bdf"))
		else
			blknames=("-")
		fi

		desc=""
		desc=${desc:-${nvme_d["$bdf"]:+NVMe}}
		desc=${desc:-${ioat_d["$bdf"]:+I/OAT}}
		desc=${desc:-${idxd_d["$bdf"]:+IDXD}}
		desc=${desc:-${virtio_d["$bdf"]:+virtio}}
		desc=${desc:-${vmd_d["$bdf"]:+VMD}}

		printf '%-8s %-15s %-6s %-6s %-7s %-16s %-10s %s\n' \
			"$desc" "$bdf" "${pci_ids_vendor["$bdf"]#0x}" "${pci_ids_device["$bdf"]#0x}" \
			"$node" "${driver:--}" "${name:-}" "${blknames[*]:--}"
	done | sort -bk2,2
}

# Print contigmem state and per-type device/driver tables on FreeBSD.
function status_freebsd() {
	local pci

	status_print() (
		local dev driver

		echo -e "BDF\t\tVendor\tDevice\tDriver"

		for pci; do
			driver=$(pciconf -l "pci$pci")
			driver=${driver%@*}
			printf '%s\t%s\t%s\t%s\n' \
				"$pci" \
				"${pci_ids_vendor["$pci"]}" \
				"${pci_ids_device["$pci"]}" \
				"$driver"
		done | sort -k1,1
	)

	local contigmem=present
	local contigmem_buffer_size
	local contigmem_num_buffers

	if ! kldstat -q -m contigmem; then
		contigmem="not present"
	fi
	if ! contigmem_buffer_size=$(kenv hw.contigmem.buffer_size 2> /dev/null); then
		contigmem_buffer_size="not set"
	fi
	if ! contigmem_num_buffers=$(kenv hw.contigmem.num_buffers 2> /dev/null); then
		contigmem_num_buffers="not set"
	fi

	cat <<- BSD_INFO
		Contigmem ($contigmem)
		Buffer Size: $contigmem_buffer_size
		Num Buffers: $contigmem_num_buffers

		NVMe devices
		$(status_print "${!nvme_d[@]}")

		I/IOAT DMA
		$(status_print "${!ioat_d[@]}")

		IDXD DMA
		$(status_print "${!idxd_d[@]}")

		VMD
		$(status_print "${!vmd_d[@]}")
	BSD_INFO
}

# Hand all collected devices to the nic_uio driver via the hw.nic_uio.bdfs
# kernel environment variable.
function configure_freebsd_pci() {
	local BDFS

	BDFS+=("${!nvme_d[@]}")
	BDFS+=("${!ioat_d[@]}")
	BDFS+=("${!idxd_d[@]}")
	BDFS+=("${!vmd_d[@]}")

	# Drop the domain part from all the addresses
	BDFS=("${BDFS[@]#*:}")

	local IFS=","
	kldunload nic_uio.ko || true
	kenv hw.nic_uio.bdfs="${BDFS[*]}"
	kldload nic_uio.ko
}

function configure_freebsd() {
	if ! check_for_driver_freebsd; then
		echo "DPDK drivers (contigmem and/or nic_uio) are missing, aborting" >&2
		return 1
	fi
	configure_freebsd_pci
	# If contigmem is already loaded but the HUGEMEM specified doesn't match the
	# previous value, unload contigmem so that we can reload with the new value.
	if kldstat -q -m contigmem; then
		# contigmem may be loaded, but the kernel environment doesn't have to
		# be necessarily set at this point. If it isn't, kenv will fail to
		# pick up the hw. options. Handle it.
		if ! contigmem_num_buffers=$(kenv hw.contigmem.num_buffers); then
			contigmem_num_buffers=-1
		fi 2> /dev/null
		if ((contigmem_num_buffers != HUGEMEM / 256)); then
			kldunload contigmem.ko
		fi
	fi
	if ! kldstat -q -m contigmem; then
		kenv hw.contigmem.num_buffers=$((HUGEMEM / 256))
		kenv hw.contigmem.buffer_size=$((256 * 1024 * 1024))
		kldload contigmem.ko
	fi
}

function reset_freebsd() {
	kldunload contigmem.ko || true
	kldunload nic_uio.ko || true
}

CMD=reset cache_pci_bus

mode=$1

if [ -z "$mode" ]; then
	mode="config"
fi

: ${HUGEMEM:=2048}
: ${PCI_ALLOWED:=""}
: ${PCI_BLOCKED:=""}

if [ -n "$NVME_ALLOWED" ]; then
	PCI_ALLOWED="$PCI_ALLOWED $NVME_ALLOWED"
fi

if [ -n "$SKIP_PCI" ]; then
	PCI_ALLOWED="none"
fi

if [ -z "$TARGET_USER" ]; then
	TARGET_USER="$SUDO_USER"
	if [ -z "$TARGET_USER" ]; then
		TARGET_USER=$(logname 2> /dev/null) || true
	fi
fi

collect_devices "$mode"

if [[ $mode == reset && $PCI_BLOCK_SYNC_ON_RESET == yes ]]; then
	# Note that this will wait only for the first block device attached to
	# a given storage controller. For nvme this may miss some of the devs
	# in case multiple namespaces are being in place.
	# FIXME: Wait for nvme controller(s) to be in live state and determine
	# number of configured namespaces, build list of potential block devs
	# and pass them to sync_dev_uevents. Is it worth the effort?
	bdfs_to_wait_for=()
	for bdf in "${!all_devices_d[@]}"; do
		((all_devices_d["$bdf"] == 0)) || continue
		if [[ -n ${nvme_d["$bdf"]} || -n ${virtio_d["$bdf"]} ]]; then
			[[ $(collect_driver "$bdf") != "${drivers_d["$bdf"]}" ]] || continue
			bdfs_to_wait_for+=("$bdf")
		fi
	done
	if ((${#bdfs_to_wait_for[@]} > 0)); then
		echo "Waiting for block devices as requested"
		export UEVENT_TIMEOUT=5 DEVPATH_LOOKUP=yes DEVPATH_SUBSYSTEM=pci
		"$rootdir/scripts/sync_dev_uevents.sh" \
			block/disk \
			"${bdfs_to_wait_for[@]}" &
		sync_pid=$!
	fi
fi

if [[ $os == Linux ]]; then
	if [[ -n $HUGEPGSZ && ! -e /sys/kernel/mm/hugepages/hugepages-${HUGEPGSZ}kB ]]; then
		echo "${HUGEPGSZ}kB is not supported by the running kernel, ignoring" >&2
		unset -v HUGEPGSZ
	fi

	HUGEPGSZ=${HUGEPGSZ:-$(grep Hugepagesize /proc/meminfo | cut -d : -f 2 | tr -dc '0-9')}
	HUGEPGSZ_MB=$((HUGEPGSZ / 1024))
	: ${NRHUGE=$(((HUGEMEM + HUGEPGSZ_MB - 1) / HUGEPGSZ_MB))}

	if [ "$mode" == "config" ]; then
		configure_linux
	elif [ "$mode" == "cleanup" ]; then
		cleanup_linux
	elif [ "$mode" == "reset" ]; then
		reset_linux
	elif [ "$mode" == "status" ]; then
		status_linux
	elif [ "$mode" == "help" ]; then
		usage "$0"
	else
		usage "$0" "Invalid argument '$mode'"
	fi
else
	if [ "$mode" == "config" ]; then
		configure_freebsd
	elif [ "$mode" == "reset" ]; then
		reset_freebsd
	elif [ "$mode" == "cleanup" ]; then
		echo "setup.sh cleanup function not yet supported on $os"
	elif [ "$mode" == "status" ]; then
		status_freebsd
	elif [ "$mode" == "help" ]; then
		usage "$0"
	else
		usage "$0" "Invalid argument '$mode'"
	fi
fi

if [[ -e /proc/$sync_pid/status ]]; then
	wait "$sync_pid"
fi