#!/usr/bin/env bash

set -e

os=$(uname -s)

if [[ $os != Linux && $os != FreeBSD ]]; then
	echo "Unsupported platform ($os), aborting"
	exit 1
fi

rootdir=$(readlink -f "$(dirname "$0")")/..
source "$rootdir/scripts/common.sh"

function usage() {
	if [[ $os == Linux ]]; then
		options="[config|reset|status|cleanup|help]"
	else
		options="[config|reset|help]"
	fi

	[[ -n $2 ]] && (
		echo "$2"
		echo ""
	)
	echo "Helper script for allocating hugepages and binding NVMe, I/OAT, VMD and Virtio devices"
	echo "to a generic VFIO kernel driver. If VFIO is not available on the system, this script"
	echo "will fall back to UIO. NVMe and Virtio devices with active mountpoints will be ignored."
	echo "All hugepage operations use the default hugepage size on the system (hugepagesz)."
	echo "Usage: $(basename $1) $options"
	echo
	echo "$options - as follows:"
	echo "config            Default mode. Allocate hugepages and bind PCI devices."
	if [[ $os == Linux ]]; then
		echo "cleanup           Remove any orphaned files that can be left in the system after SPDK application exit"
	fi
	echo "reset             Rebind PCI devices back to their original drivers."
	echo "                  Also cleanup any leftover spdk files/resources."
	echo "                  Hugepage memory size will remain unchanged."
	if [[ $os == Linux ]]; then
		echo "status            Print status of all SPDK-compatible devices on the system."
	fi
	echo "help              Print this help message."
	echo
	echo "The following environment variables can be specified."
	echo "HUGEMEM           Size of hugepage memory to allocate (in MB). 2048 by default."
	echo "                  For NUMA systems, the hugepages will be distributed on node0 by"
	echo "                  default."
	echo "HUGE_EVEN_ALLOC   If set to 'yes', hugepages will be evenly distributed across all"
	echo "                  system's NUMA nodes (effectively ignoring anything set in HUGENODE)."
	echo "                  Uses kernel's default for hugepages size."
	echo "NRHUGE            Number of hugepages to allocate. This variable overrides HUGEMEM."
	echo "HUGENODE          Specific NUMA node to allocate hugepages on. Multiple nodes can be"
	echo "                  separated with commas. By default, NRHUGE will be applied on each node."
	echo "                  Hugepages can be defined per node with e.g.:"
	echo "                  HUGENODE='nodes_hp[0]=2048,nodes_hp[1]=512,2' - this will allocate"
	echo "                  2048 pages for node0, 512 for node1 and default NRHUGE for node2."
	echo "HUGEPGSZ          Size of the hugepages to use in kB. If not set, kernel's default"
	echo "                  setting is used."
	echo "CLEAR_HUGE        If set to 'yes', an attempt to remove hugepages from all nodes will"
	echo "                  be made prior to allocation."
	echo "PCI_ALLOWED"
	echo "PCI_BLOCKED       Whitespace separated list of PCI devices (NVMe, I/OAT, VMD, Virtio)."
	echo "                  Each device must be specified as a full PCI address."
	echo "                  E.g. PCI_ALLOWED=\"0000:01:00.0 0000:02:00.0\""
	echo "                  To block all PCI devices, use an invalid address."
	echo "                  E.g. PCI_BLOCKED=\"none\""
	echo "                  If PCI_ALLOWED and PCI_BLOCKED are empty or unset, all PCI devices"
	echo "                  will be bound."
	echo "                  Each device in PCI_BLOCKED will be ignored (driver won't be changed)."
	echo "                  PCI_BLOCKED has precedence over PCI_ALLOWED."
	echo "TARGET_USER       User that will own the hugepage mountpoint directory and vfio groups."
	echo "                  By default the current user will be used."
	echo "DRIVER_OVERRIDE   Disable automatic vfio-pci/uio_pci_generic selection and forcefully"
	echo "                  bind devices to the given driver."
	echo "                  E.g. DRIVER_OVERRIDE=uio_pci_generic or DRIVER_OVERRIDE=/home/public/dpdk/build/kmod/igb_uio.ko"
	echo "PCI_BLOCK_SYNC_ON_RESET"
	echo "                  If set in the environment, an attempt to wait for block devices associated"
	echo "                  with a given PCI device will be made upon reset"
	exit 0
}
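
# Example invocations (illustrative):
#   sudo ./setup.sh                       # same as 'config'
#   sudo HUGEMEM=4096 ./setup.sh config
#   sudo ./setup.sh reset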

# lsmod won't work in monolithic kernels, so fall back to /sys/module.
# We also check /sys/bus/pci/drivers/ since neither lsmod nor /sys/module
# may contain the needed info (like on Fedora-like OSes).
# Note: returns non-zero when the driver is present (1 via lsmod, 2 via
# sysfs) and 0 when it is absent - callers invert the result with '!'.
function check_for_driver() {
	if lsmod | grep -q "${1//-/_}"; then
		return 1
	fi

	if [[ -d /sys/module/${1} || -d \
		/sys/module/${1//-/_} || -d \
		/sys/bus/pci/drivers/${1} || -d \
		/sys/bus/pci/drivers/${1//-/_} ]]; then
		return 2
	fi
	return 0
}

function check_for_driver_freebsd() {
	# Check if DPDK drivers (nic_uio, contigmem) are in the kernel's module path.
	local search_paths path driver
	IFS=";" read -ra search_paths < <(kldconfig -rU)

	for driver in contigmem.ko nic_uio.ko; do
		for path in "${search_paths[@]}"; do
			[[ -f $path/$driver ]] && continue 2
		done
		return 1
	done
	return 0
}

function pci_dev_echo() {
	local bdf="$1"
	shift
	echo "$bdf (${pci_ids_vendor["$bdf"]#0x} ${pci_ids_device["$bdf"]#0x}): $*"
}

function linux_bind_driver() {
	bdf="$1"
	driver_name="$2"
	old_driver_name=${drivers_d["$bdf"]:-no driver}
	ven_dev_id="${pci_ids_vendor["$bdf"]#0x} ${pci_ids_device["$bdf"]#0x}"

	if [[ $driver_name == "$old_driver_name" ]]; then
		pci_dev_echo "$bdf" "Already using the $old_driver_name driver"
		return 0
	fi

	if [[ $old_driver_name != "no driver" ]]; then
		echo "$ven_dev_id" > "/sys/bus/pci/devices/$bdf/driver/remove_id" 2> /dev/null || true
		echo "$bdf" > "/sys/bus/pci/devices/$bdf/driver/unbind"
	fi

	pci_dev_echo "$bdf" "$old_driver_name -> $driver_name"

	echo "$ven_dev_id" > "/sys/bus/pci/drivers/$driver_name/new_id" 2> /dev/null || true
	echo "$bdf" > "/sys/bus/pci/drivers/$driver_name/bind" 2> /dev/null || true

	if [[ $driver_name == uio_pci_generic && -e /sys/module/igb_uio ]]; then
		# Check if the uio_pci_generic driver is broken as it might be in
		# some 4.18.x kernels (see centos8 for instance) - if our device
		# didn't get a proper uio entry, fall back to igb_uio
		if [[ ! -e /sys/bus/pci/devices/$bdf/uio ]]; then
			pci_dev_echo "$bdf" "uio_pci_generic potentially broken, moving to igb_uio"
			drivers_d["$bdf"]="no driver"
			# This call will override $driver_name for remaining devices as well
			linux_bind_driver "$bdf" igb_uio
		fi
	fi

	iommu_group=$(basename "$(readlink -f "/sys/bus/pci/devices/$bdf/iommu_group")")
	if [ -e "/dev/vfio/$iommu_group" ]; then
		if [ -n "$TARGET_USER" ]; then
			chown "$TARGET_USER" "/dev/vfio/$iommu_group"
		fi
	fi
}
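
# Usage sketch (illustrative; collect_devices must have populated drivers_d first):
#   linux_bind_driver "0000:01:00.0" vfio-pci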

function linux_unbind_driver() {
	local bdf="$1"
	local ven_dev_id
	ven_dev_id="${pci_ids_vendor["$bdf"]#0x} ${pci_ids_device["$bdf"]#0x}"
	local old_driver_name=${drivers_d["$bdf"]:-no driver}

	if [[ -e /sys/bus/pci/drivers/$old_driver_name ]]; then
		echo "$ven_dev_id" > "/sys/bus/pci/drivers/$old_driver_name/remove_id" 2> /dev/null || true
		echo "$bdf" > "/sys/bus/pci/drivers/$old_driver_name/unbind"
	fi

	pci_dev_echo "$bdf" "$old_driver_name -> no driver"
}

function linux_hugetlbfs_mounts() {
	mount | grep ' type hugetlbfs ' | awk '{ print $3 }'
}

function get_block_dev_from_bdf() {
	local bdf=$1
	local block

	for block in /sys/block/*; do
		if [[ $(readlink -f "$block/device") == *"/$bdf/"* ]]; then
			echo "${block##*/}"
		fi
	done
}

function get_mounted_part_dev_from_bdf_block() {
	local bdf=$1
	local blocks block dev mount

	hash lsblk || return 1
	blocks=($(get_block_dev_from_bdf "$bdf"))

	for block in "${blocks[@]}"; do
		while read -r dev mount; do
			if [[ -e $mount ]]; then
				echo "$block:$dev"
			fi
		done < <(lsblk -l -n -o NAME,MOUNTPOINT "/dev/$block")
	done
}

function collect_devices() {
	# NVMe, IOAT, IDXD, VIRTIO, VMD

	local ids dev_type dev_id bdf bdfs in_use driver

	ids+="PCI_DEVICE_ID_INTEL_IOAT"
	ids+="|PCI_DEVICE_ID_INTEL_IDXD"
	ids+="|PCI_DEVICE_ID_VIRTIO"
	ids+="|PCI_DEVICE_ID_INTEL_VMD"
	ids+="|SPDK_PCI_CLASS_NVME"

	local -gA nvme_d ioat_d idxd_d virtio_d vmd_d all_devices_d drivers_d

	while read -r _ dev_type dev_id; do
		bdfs=(${pci_bus_cache["0x8086:$dev_id"]})
		[[ $dev_type == *NVME* ]] && bdfs=(${pci_bus_cache["$dev_id"]})
		[[ $dev_type == *VIRT* ]] && bdfs=(${pci_bus_cache["0x1af4:$dev_id"]})
		[[ $dev_type =~ (NVME|IOAT|IDXD|VIRTIO|VMD) ]] && dev_type=${BASH_REMATCH[1],,}
		for bdf in "${bdfs[@]}"; do
			in_use=0
			if [[ $1 != status ]]; then
				if ! pci_can_use "$bdf"; then
					pci_dev_echo "$bdf" "Skipping denied controller at $bdf"
					in_use=1
				fi
				if [[ $dev_type == nvme || $dev_type == virtio ]]; then
					if ! verify_bdf_mounts "$bdf"; then
						in_use=1
					fi
				fi
				if [[ $dev_type == vmd ]]; then
					if [[ $PCI_ALLOWED != *"$bdf"* ]]; then
						pci_dev_echo "$bdf" "Skipping not allowed VMD controller at $bdf"
						in_use=1
					fi
				fi
			fi
			eval "${dev_type}_d[$bdf]=$in_use"
			all_devices_d["$bdf"]=$in_use
			if [[ -e /sys/bus/pci/devices/$bdf/driver ]]; then
				driver=$(readlink -f "/sys/bus/pci/devices/$bdf/driver")
				drivers_d["$bdf"]=${driver##*/}
			fi
		done
	done < <(grep -E "$ids" "$rootdir/include/spdk/pci_ids.h")
}
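
# After collect_devices runs, the global maps can be inspected, e.g. (illustrative):
#   for bdf in "${!nvme_d[@]}"; do echo "NVMe $bdf in_use=${nvme_d[$bdf]}"; done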

function collect_driver() {
	local bdf=$1
	local drivers driver

	[[ -e /sys/bus/pci/devices/$bdf/modalias ]] || return 1
	if drivers=($(modprobe -R "$(< "/sys/bus/pci/devices/$bdf/modalias")")); then
		# Pick the first entry in case multiple aliases are bound to a driver.
		driver=$(readlink -f "/sys/module/${drivers[0]}/drivers/pci:"*)
		driver=${driver##*/}
	else
		[[ -n ${nvme_d["$bdf"]} ]] && driver=nvme
		[[ -n ${ioat_d["$bdf"]} ]] && driver=ioatdma
		[[ -n ${idxd_d["$bdf"]} ]] && driver=idxd
		[[ -n ${virtio_d["$bdf"]} ]] && driver=virtio-pci
		[[ -n ${vmd_d["$bdf"]} ]] && driver=vmd
	fi 2> /dev/null
	echo "$driver"
}

function verify_bdf_mounts() {
	local bdf=$1
	local blknames
	blknames=($(get_mounted_part_dev_from_bdf_block "$bdf")) || return 1

	if ((${#blknames[@]} > 0)); then
		local IFS=","
		pci_dev_echo "$bdf" "Active mountpoints on ${blknames[*]}, so not binding PCI dev"
		return 1
	fi
}

function configure_linux_pci() {
	local driver_path=""
	driver_name=""
	igb_uio_fallback=""

	# igb_uio is a common driver to override with and it depends on uio.
	modprobe uio
	if [[ -r "$rootdir/dpdk/build-tmp/kernel/linux/igb_uio/igb_uio.ko" ]]; then
		igb_uio_fallback=$rootdir/dpdk/build-tmp/kernel/linux/igb_uio/igb_uio.ko
		insmod "$igb_uio_fallback" || true
	fi

	if [[ -n "${DRIVER_OVERRIDE}" ]]; then
		driver_path="$DRIVER_OVERRIDE"
		driver_name="${DRIVER_OVERRIDE##*/}"
		# modprobe and the sysfs don't use the .ko suffix.
		driver_name=${driver_name%.ko}
		# path = name -> there is no path
		if [[ "$driver_path" = "$driver_name" ]]; then
			driver_path=""
		fi
	elif [[ -n "$(ls /sys/kernel/iommu_groups)" || (-e \
		/sys/module/vfio/parameters/enable_unsafe_noiommu_mode && \
		"$(cat /sys/module/vfio/parameters/enable_unsafe_noiommu_mode)" == "Y") ]]; then
		driver_name=vfio-pci
		# Just in case, attempt to load the VFIO_IOMMU_TYPE1 module into the kernel - this
		# should be done automatically by modprobe since this particular module should
		# be a part of vfio-pci dependencies, however, on some distros, it seems that
		# it's not the case. See #1689.
		if modinfo vfio_iommu_type1 > /dev/null; then
			modprobe vfio_iommu_type1
		fi
	elif modinfo uio_pci_generic > /dev/null 2>&1; then
		driver_name=uio_pci_generic
	elif [[ -e $igb_uio_fallback ]]; then
		driver_name="igb_uio"
		echo "WARNING: uio_pci_generic not detected - using $driver_name"
	else
		echo "No valid drivers found [vfio-pci, uio_pci_generic, igb_uio]. Please enable either the vfio-pci or uio_pci_generic"
		echo "kernel modules, or have SPDK build the igb_uio driver by running ./configure --with-igb-uio-driver and recompiling."
		return 1
	fi

	# modprobe assumes the directory of the module. If the user passes in a path, we should use insmod
	if [[ -n "$driver_path" ]]; then
		insmod "$driver_path" || true
	else
		modprobe "$driver_name"
	fi

	for bdf in "${!all_devices_d[@]}"; do
		if ((all_devices_d["$bdf"] == 0)); then
			if [[ -n ${nvme_d["$bdf"]} ]]; then
				# Some nvme controllers may take a significant amount of time while being
				# unbound from the driver. Put that task into the background to speed up the
				# whole process. Currently this is done only for the devices bound to the
				# nvme driver as others, i.e., ioatdma's, trigger a kernel BUG when being
				# unbound in parallel. See https://bugzilla.kernel.org/show_bug.cgi?id=209041.
				linux_bind_driver "$bdf" "$driver_name" &
			else
				linux_bind_driver "$bdf" "$driver_name"
			fi
		fi
	done
	wait

	echo "1" > "/sys/bus/pci/rescan"
}
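
# A quick way to predict which branch configure_linux_pci takes (illustrative;
# DRIVER_OVERRIDE takes precedence over both):
#   ls /sys/kernel/iommu_groups          # non-empty -> vfio-pci is used
#   modinfo uio_pci_generic > /dev/null  # otherwise uio_pci_generic, if present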

function cleanup_linux() {
	shopt -s extglob nullglob
	dirs_to_clean=""
	dirs_to_clean="$(echo {/var/run,/tmp}/dpdk/spdk{,_pid}+([0-9])) "
	if [[ -d $XDG_RUNTIME_DIR && $XDG_RUNTIME_DIR != *" "* ]]; then
		dirs_to_clean+="$(readlink -e assert_not_empty $XDG_RUNTIME_DIR/dpdk/spdk{,_pid}+([0-9]) || true) "
	fi

	files_to_clean=""
	for dir in $dirs_to_clean; do
		files_to_clean+="$(echo $dir/*) "
	done
	shopt -u extglob nullglob

	files_to_clean+="$(ls -1 /dev/shm/* \
		| grep -E '(spdk_tgt|iscsi|vhost|nvmf|rocksdb|bdevio|bdevperf|vhost_fuzz|nvme_fuzz)_trace|spdk_iscsi_conns' || true) "
	files_to_clean="$(readlink -e assert_not_empty $files_to_clean || true)"
	if [[ -z "$files_to_clean" ]]; then
		echo "Clean"
		return 0
	fi

	shopt -s extglob
	for fd_dir in $(echo /proc/+([0-9])); do
		opened_files+="$(readlink -e assert_not_empty $fd_dir/fd/* || true)"
	done
	shopt -u extglob

	if [[ -z "$opened_files" ]]; then
		echo "Can't get list of opened files!"
		exit 1
	fi

	echo 'Cleaning'
	for f in $files_to_clean; do
		if ! echo "$opened_files" | grep -E -q "^$f\$"; then
			echo "Removing: $f"
			rm $f
		else
			echo "Still open: $f"
		fi
	done

	for dir in $dirs_to_clean; do
		if ! echo "$opened_files" | grep -E -q "^$dir\$"; then
			echo "Removing: $dir"
			rmdir $dir
		else
			echo "Still open: $dir"
		fi
	done
	echo "Clean"

	unset dirs_to_clean files_to_clean opened_files
}

check_hugepages_alloc() {
	local hp_int=$1
	local allocated_hugepages

	echo $((NRHUGE < 0 ? 0 : NRHUGE)) > "$hp_int"

	allocated_hugepages=$(< "$hp_int")
	if ((allocated_hugepages < NRHUGE)); then
		cat <<- ERROR

			## ERROR: requested $NRHUGE hugepages but only $allocated_hugepages could be allocated ${2:+on node$2}.
			## Memory might be heavily fragmented. Please try flushing the system cache, or reboot the machine.
		ERROR
		return 1
	fi
}
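
# Example (illustrative): 2048 pages on node0 and 512 on node1, using the
# HUGENODE syntax documented in usage():
#   sudo HUGENODE='nodes_hp[0]=2048,nodes_hp[1]=512' ./setup.sh config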

clear_hugepages() { echo 0 > /proc/sys/vm/nr_hugepages; }

configure_linux_hugepages() {
	local node system_nodes
	local nodes_to_use nodes_hp

	if [[ $CLEAR_HUGE == yes ]]; then
		clear_hugepages
	fi

	if [[ $HUGE_EVEN_ALLOC == yes ]]; then
		clear_hugepages
		check_hugepages_alloc /proc/sys/vm/nr_hugepages
		return 0
	fi

	for node in /sys/devices/system/node/node*; do
		[[ -e $node ]] || continue
		nodes[${node##*node}]=$node/hugepages/hugepages-${HUGEPGSZ}kB/nr_hugepages
	done

	IFS="," read -ra nodes_to_use <<< "$HUGENODE"
	if ((${#nodes_to_use[@]} == 0)); then
		nodes_to_use[0]=0
	fi

	# Align indexes with node ids
	for node in "${!nodes_to_use[@]}"; do
		if [[ ${nodes_to_use[node]} =~ ^nodes_hp\[[0-9]+\]= ]]; then
			eval "${nodes_to_use[node]}"
		elif [[ ${nodes_to_use[node]} =~ ^[0-9]+$ ]]; then
			nodes_hp[nodes_to_use[node]]=$NRHUGE
		fi
	done

	for node in "${!nodes_hp[@]}"; do
		if [[ -z ${nodes[node]} ]]; then
			echo "Node $node doesn't exist, ignoring" >&2
			continue
		fi
		NRHUGE=${nodes_hp[node]:-$NRHUGE} check_hugepages_alloc "${nodes[node]}" "$node"
	done
}

function configure_linux() {
	configure_linux_pci
	hugetlbfs_mounts=$(linux_hugetlbfs_mounts)

	if [ -z "$hugetlbfs_mounts" ]; then
		hugetlbfs_mounts=/mnt/huge
		echo "Mounting hugetlbfs at $hugetlbfs_mounts"
		mkdir -p "$hugetlbfs_mounts"
		mount -t hugetlbfs nodev "$hugetlbfs_mounts"
	fi

	configure_linux_hugepages

	if [ "$driver_name" = "vfio-pci" ]; then
		if [ -n "$TARGET_USER" ]; then
			for mount in $hugetlbfs_mounts; do
				chown "$TARGET_USER" "$mount"
				chmod g+w "$mount"
			done

			MEMLOCK_AMNT=$(su "$TARGET_USER" -c "ulimit -l")
			if [[ $MEMLOCK_AMNT != "unlimited" ]]; then
				MEMLOCK_MB=$((MEMLOCK_AMNT / 1024))
				cat <<- MEMLOCK
					"$TARGET_USER" user memlock limit: $MEMLOCK_MB MB

					This is the maximum amount of memory you will be
					able to use with DPDK and VFIO if run as user "$TARGET_USER".
					To change this, please adjust limits.conf memlock limit for user "$TARGET_USER".
				MEMLOCK
				if ((MEMLOCK_AMNT < 65536)); then
					echo ""
					echo "## WARNING: memlock limit is less than 64MB"
					echo -n "## DPDK with VFIO may not be able to initialize "
					echo "if run as user \"$TARGET_USER\"."
				fi
			fi
		fi
	fi

	if [ ! -f /dev/cpu/0/msr ]; then
		# Some distros build msr as a module. Make sure it's loaded to ensure
		# DPDK can easily figure out the TSC rate rather than relying on 100ms
		# sleeps.
		modprobe msr || true
	fi
}

function reset_linux_pci() {
	# virtio
	# TODO: check if virtio-pci is loaded first and just unbind if it is not loaded
	# Requires some more investigation - for example, some kernels do not seem to have
	# virtio-pci but just virtio_scsi instead. Also need to make sure we get the
	# underscore vs. dash right in the virtio_scsi name.
	modprobe virtio-pci || true
	for bdf in "${!all_devices_d[@]}"; do
		((all_devices_d["$bdf"] == 0)) || continue

		driver=$(collect_driver "$bdf")
		if ! check_for_driver "$driver"; then
			linux_bind_driver "$bdf" "$driver"
		else
			linux_unbind_driver "$bdf"
		fi
	done

	echo "1" > "/sys/bus/pci/rescan"
}
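
# To check by hand which driver a device is currently bound to (illustrative):
#   basename "$(readlink -f /sys/bus/pci/devices/0000:01:00.0/driver)"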

function reset_linux() {
	reset_linux_pci
	for mount in $(linux_hugetlbfs_mounts); do
		for hp in "$mount"/spdk*map_*; do
			flock -n "$hp" true && rm -f "$hp"
		done
	done
	rm -f /run/.spdk*
}

function status_linux() {
	echo "Hugepages" >&2
	printf "%-6s %10s %8s / %6s\n" "node" "hugesize" "free" "total" >&2

	numa_nodes=0
	shopt -s nullglob
	for path in /sys/devices/system/node/node*/hugepages/hugepages-*/; do
		numa_nodes=$((numa_nodes + 1))
		free_pages=$(cat $path/free_hugepages)
		all_pages=$(cat $path/nr_hugepages)

		[[ $path =~ (node[0-9]+)/hugepages/hugepages-([0-9]+kB) ]]

		node=${BASH_REMATCH[1]}
		huge_size=${BASH_REMATCH[2]}

		printf "%-6s %10s %8s / %6s\n" $node $huge_size $free_pages $all_pages
	done
	shopt -u nullglob

	# fall back to system-wide hugepages
	if [ "$numa_nodes" = "0" ]; then
		free_pages=$(grep HugePages_Free /proc/meminfo | awk '{ print $2 }')
		all_pages=$(grep HugePages_Total /proc/meminfo | awk '{ print $2 }')
		node="-"
		huge_size="$HUGEPGSZ"

		printf "%-6s %10s %8s / %6s\n" $node $huge_size $free_pages $all_pages
	fi

	printf '\n%-8s %-15s %-6s %-6s %-7s %-16s %-10s %s\n' \
		"Type" "BDF" "Vendor" "Device" "NUMA" "Driver" "Device" "Block devices" >&2

	for bdf in "${!all_devices_d[@]}"; do
		driver=${drivers_d["$bdf"]}
		if [ "$numa_nodes" = "0" ]; then
			node="-"
		else
			node=$(cat /sys/bus/pci/devices/$bdf/numa_node)
			if ((node == -1)); then
				node=unknown
			fi
		fi
		if [ "$driver" = "nvme" ] && [ -d /sys/bus/pci/devices/$bdf/nvme ]; then
			name=$(ls /sys/bus/pci/devices/$bdf/nvme)
		else
			name="-"
		fi

		if [[ -n ${nvme_d["$bdf"]} || -n ${virtio_d["$bdf"]} ]]; then
			blknames=($(get_block_dev_from_bdf "$bdf"))
		else
			blknames=("-")
		fi

		desc=""
		desc=${desc:-${nvme_d["$bdf"]:+NVMe}}
		desc=${desc:-${ioat_d["$bdf"]:+I/OAT}}
		desc=${desc:-${idxd_d["$bdf"]:+IDXD}}
		desc=${desc:-${virtio_d["$bdf"]:+virtio}}
		desc=${desc:-${vmd_d["$bdf"]:+VMD}}

		printf '%-8s %-15s %-6s %-6s %-7s %-16s %-10s %s\n' \
			"$desc" "$bdf" "${pci_ids_vendor["$bdf"]#0x}" "${pci_ids_device["$bdf"]#0x}" \
			"$node" "${driver:--}" "${name:-}" "${blknames[*]:--}"
	done | sort -bk2,2
}

function status_freebsd() {
	local pci

	status_print() (
		local dev driver

		echo -e "BDF\t\tVendor\tDevice\tDriver"

		for pci; do
			driver=$(pciconf -l "pci$pci")
			driver=${driver%@*}
			printf '%s\t%s\t%s\t%s\n' \
				"$pci" \
				"${pci_ids_vendor["$pci"]}" \
				"${pci_ids_device["$pci"]}" \
				"$driver"
		done | sort -k1,1
	)

	local contigmem=present
	local contigmem_buffer_size
	local contigmem_num_buffers

	if ! kldstat -q -m contigmem; then
		contigmem="not present"
	fi
	if ! contigmem_buffer_size=$(kenv hw.contigmem.buffer_size 2> /dev/null); then
		contigmem_buffer_size="not set"
	fi
	if ! contigmem_num_buffers=$(kenv hw.contigmem.num_buffers 2> /dev/null); then
		contigmem_num_buffers="not set"
	fi

	cat <<- BSD_INFO
		Contigmem ($contigmem)
		Buffer Size: $contigmem_buffer_size
		Num Buffers: $contigmem_num_buffers

		NVMe devices
		$(status_print "${!nvme_d[@]}")

		I/OAT DMA
		$(status_print "${!ioat_d[@]}")

		IDXD DMA
		$(status_print "${!idxd_d[@]}")

		VMD
		$(status_print "${!vmd_d[@]}")
	BSD_INFO
}
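
# The contigmem settings shown above can also be queried directly (illustrative):
#   kenv hw.contigmem.num_buffers
#   kenv hw.contigmem.buffer_size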

function configure_freebsd_pci() {
	local BDFS

	BDFS+=("${!nvme_d[@]}")
	BDFS+=("${!ioat_d[@]}")
	BDFS+=("${!idxd_d[@]}")
	BDFS+=("${!vmd_d[@]}")

	# Drop the domain part from all the addresses
	BDFS=("${BDFS[@]#*:}")

	local IFS=","
	kldunload nic_uio.ko || true
	kenv hw.nic_uio.bdfs="${BDFS[*]}"
	kldload nic_uio.ko
}

function configure_freebsd() {
	if ! check_for_driver_freebsd; then
		echo "DPDK drivers (contigmem and/or nic_uio) are missing, aborting" >&2
		return 1
	fi
	configure_freebsd_pci
	# If contigmem is already loaded but the HUGEMEM specified doesn't match the
	# previous value, unload contigmem so that we can reload with the new value.
	if kldstat -q -m contigmem; then
		# contigmem may be loaded, but the kernel environment doesn't have to
		# be necessarily set at this point. If it isn't, kenv will fail to
		# pick up the hw. options. Handle it.
		if ! contigmem_num_buffers=$(kenv hw.contigmem.num_buffers); then
			contigmem_num_buffers=-1
		fi 2> /dev/null
		if ((contigmem_num_buffers != HUGEMEM / 256)); then
			kldunload contigmem.ko
		fi
	fi
	if ! kldstat -q -m contigmem; then
		kenv hw.contigmem.num_buffers=$((HUGEMEM / 256))
		kenv hw.contigmem.buffer_size=$((256 * 1024 * 1024))
		kldload contigmem.ko
	fi
}

function reset_freebsd() {
	kldunload contigmem.ko || true
	kldunload nic_uio.ko || true
}

CMD=reset cache_pci_bus

mode=$1

if [ -z "$mode" ]; then
	mode="config"
fi

: ${HUGEMEM:=2048}
: ${PCI_ALLOWED:=""}
: ${PCI_BLOCKED:=""}

if [ -n "$NVME_ALLOWED" ]; then
	PCI_ALLOWED="$PCI_ALLOWED $NVME_ALLOWED"
fi

if [ -n "$SKIP_PCI" ]; then
	PCI_ALLOWED="none"
fi

if [ -z "$TARGET_USER" ]; then
	TARGET_USER="$SUDO_USER"
	if [ -z "$TARGET_USER" ]; then
		TARGET_USER=$(logname 2> /dev/null) || true
	fi
fi

collect_devices "$mode"

if [[ $mode == reset && $PCI_BLOCK_SYNC_ON_RESET == yes ]]; then
	# Note that this will wait only for the first block device attached to
	# a given storage controller. For nvme this may miss some of the devs
	# in case multiple namespaces are in place.
	# FIXME: Wait for nvme controller(s) to be in live state and determine
	# number of configured namespaces, build list of potential block devs
	# and pass them to sync_dev_uevents. Is it worth the effort?
	bdfs_to_wait_for=()
	for bdf in "${!all_devices_d[@]}"; do
		((all_devices_d["$bdf"] == 0)) || continue
		if [[ -n ${nvme_d["$bdf"]} || -n ${virtio_d["$bdf"]} ]]; then
			[[ $(collect_driver "$bdf") != "${drivers_d["$bdf"]}" ]] || continue
			bdfs_to_wait_for+=("$bdf")
		fi
	done
	if ((${#bdfs_to_wait_for[@]} > 0)); then
		echo "Waiting for block devices as requested"
		export UEVENT_TIMEOUT=5 DEVPATH_LOOKUP=yes DEVPATH_SUBSYSTEM=pci
		"$rootdir/scripts/sync_dev_uevents.sh" \
			block/disk \
			"${bdfs_to_wait_for[@]}" &
		sync_pid=$!
	fi
fi
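
# The NRHUGE default below rounds HUGEMEM (in MB) up to whole pages; e.g. with
# the common 2048 kB hugepage size, HUGEMEM=2048 yields NRHUGE=1024 (illustrative).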

if [[ $os == Linux ]]; then
	if [[ -n $HUGEPGSZ && ! -e /sys/kernel/mm/hugepages/hugepages-${HUGEPGSZ}kB ]]; then
		echo "${HUGEPGSZ}kB is not supported by the running kernel, ignoring" >&2
		unset -v HUGEPGSZ
	fi

	HUGEPGSZ=${HUGEPGSZ:-$(grep Hugepagesize /proc/meminfo | cut -d : -f 2 | tr -dc '0-9')}
	HUGEPGSZ_MB=$((HUGEPGSZ / 1024))
	: ${NRHUGE=$(((HUGEMEM + HUGEPGSZ_MB - 1) / HUGEPGSZ_MB))}

	if [ "$mode" == "config" ]; then
		configure_linux
	elif [ "$mode" == "cleanup" ]; then
		cleanup_linux
	elif [ "$mode" == "reset" ]; then
		reset_linux
	elif [ "$mode" == "status" ]; then
		status_linux
	elif [ "$mode" == "help" ]; then
		usage $0
	else
		usage $0 "Invalid argument '$mode'"
	fi
else
	if [ "$mode" == "config" ]; then
		configure_freebsd
	elif [ "$mode" == "reset" ]; then
		reset_freebsd
	elif [ "$mode" == "cleanup" ]; then
		echo "setup.sh cleanup function not yet supported on $os"
	elif [ "$mode" == "status" ]; then
		status_freebsd
	elif [ "$mode" == "help" ]; then
		usage $0
	else
		usage $0 "Invalid argument '$mode'"
	fi
fi

if [[ -e /proc/$sync_pid/status ]]; then
	wait "$sync_pid"
fi