#!/usr/bin/env bash
#
# Allocate hugepages and bind NVMe, I/OAT, IDXD, VMD and Virtio PCI devices to a
# userspace-capable kernel driver (vfio-pci, falling back to uio_pci_generic or
# igb_uio) on Linux, or nic_uio/contigmem on FreeBSD. See usage() for the full
# list of modes and environment variables.

set -e

os=$(uname -s)

if [[ $os != Linux && $os != FreeBSD ]]; then
	echo "Not supported platform ($os), aborting"
	exit 1
fi

rootdir=$(readlink -f "$(dirname "$0")")/..
source "$rootdir/scripts/common.sh"

# Print help text and terminate.
# $1 - path of this script (used for the usage line)
# $2 - optional message (e.g. an error) printed before the help text
function usage() {
	if [[ $os == Linux ]]; then
		options="[config|reset|status|cleanup|help]"
	else
		options="[config|reset|help]"
	fi

	[[ -n $2 ]] && (
		echo "$2"
		echo ""
	)
	echo "Helper script for allocating hugepages and binding NVMe, I/OAT, VMD and Virtio devices"
	echo "to a generic VFIO kernel driver. If VFIO is not available on the system, this script"
	echo "will fall back to UIO. NVMe and Virtio devices with active mountpoints will be ignored."
	echo "All hugepage operations use default hugepage size on the system (hugepagesz)."
	echo "Usage: $(basename "$1") $options"
	echo
	echo "$options - as following:"
	echo "config            Default mode. Allocate hugepages and bind PCI devices."
	if [[ $os == Linux ]]; then
		echo "cleanup           Remove any orphaned files that can be left in the system after SPDK application exit"
	fi
	echo "reset             Rebind PCI devices back to their original drivers."
	echo "                  Also cleanup any leftover spdk files/resources."
	echo "                  Hugepage memory size will remain unchanged."
	if [[ $os == Linux ]]; then
		echo "status            Print status of all SPDK-compatible devices on the system."
	fi
	echo "help              Print this help message."
	echo
	echo "The following environment variables can be specified."
	echo "HUGEMEM           Size of hugepage memory to allocate (in MB). 2048 by default."
	echo "                  For NUMA systems, the hugepages will be distributed on node0 by"
	echo "                  default."
	echo "HUGE_EVEN_ALLOC   If set to 'yes', hugepages will be evenly distributed across all"
	echo "                  system's NUMA nodes (effectively ignoring anything set in HUGENODE)."
	echo "                  Uses kernel's default for hugepages size."
	echo "NRHUGE            Number of hugepages to allocate. This variable overwrites HUGEMEM."
	echo "HUGENODE          Specific NUMA node to allocate hugepages on. Multiple nodes can be"
	echo "                  separated with commas. By default, NRHUGE will be applied on each node."
	echo "                  Hugepages can be defined per node with e.g.:"
	echo "                  HUGENODE='nodes_hp[0]=2048,nodes_hp[1]=512,2' - this will allocate"
	echo "                  2048 pages for node0, 512 for node1 and default NRHUGE for node2."
	echo "HUGEPGSZ          Size of the hugepages to use in kB. If not set, kernel's default"
	echo "                  setting is used."
	echo "CLEAR_HUGE        If set to 'yes', the attempt to remove hugepages from all nodes will"
	echo "                  be made prior to allocation."
	echo "PCI_ALLOWED"
	echo "PCI_BLOCKED       Whitespace separated list of PCI devices (NVMe, I/OAT, VMD, Virtio)."
	echo "                  Each device must be specified as a full PCI address."
	echo "                  E.g. PCI_ALLOWED=\"0000:01:00.0 0000:02:00.0\""
	echo "                  To block all PCI devices use a non-valid address."
	echo "                  E.g. PCI_BLOCKED=\"none\""
	echo "                  If PCI_ALLOWED and PCI_BLOCKED are empty or unset, all PCI devices"
	echo "                  will be bound."
	echo "                  Each device in PCI_BLOCKED will be ignored (driver won't be changed)."
	echo "                  PCI_BLOCKED has precedence over PCI_ALLOWED."
	echo "TARGET_USER       User that will own hugepage mountpoint directory and vfio groups."
	echo "                  By default the current user will be used."
	echo "DRIVER_OVERRIDE   Disable automatic vfio-pci/uio_pci_generic selection and forcefully"
	echo "                  bind devices to the given driver."
	echo "                  E.g. DRIVER_OVERRIDE=uio_pci_generic or DRIVER_OVERRIDE=/home/public/dpdk/build/kmod/igb_uio.ko"
	echo "PCI_BLOCK_SYNC_ON_RESET"
	echo "                  If set in the environment, the attempt to wait for block devices associated"
	echo "                  with given PCI device will be made upon reset"
	exit 0
}

# In monolithic kernels the lsmod won't work. So
# back that with a /sys/modules. We also check
# /sys/bus/pci/drivers/ as neither lsmod nor /sys/modules might
# contain needed info (like in Fedora-like OS).
# Returns 0 if driver $1 is NOT available, non-zero when it is.
function check_for_driver() {
	if lsmod | grep -q "${1//-/_}"; then
		return 1
	fi

	if [[ -d /sys/module/${1} || -d \
		/sys/module/${1//-/_} || -d \
		/sys/bus/pci/drivers/${1} || -d \
		/sys/bus/pci/drivers/${1//-/_} ]]; then
		return 2
	fi
	return 0
}

function check_for_driver_freebsd() {
	# Check if dpdk drivers (nic_uio, contigmem) are in the kernel's module path.
	local search_paths path driver
	IFS=";" read -ra search_paths < <(kldconfig -rU)

	for driver in contigmem.ko nic_uio.ko; do
		for path in "${search_paths[@]}"; do
			[[ -f $path/$driver ]] && continue 2
		done
		return 1
	done
	return 0
}

# Echo a message ($2..) prefixed with the BDF ($1) and its vendor:device ids.
function pci_dev_echo() {
	local bdf="$1"
	shift
	echo "$bdf (${pci_ids_vendor["$bdf"]#0x} ${pci_ids_device["$bdf"]#0x}): $*"
}

# Bind device $1 (BDF) to driver $2 via sysfs, unbinding the current driver
# first. Falls back to igb_uio when uio_pci_generic is broken, and hands vfio
# group ownership to $TARGET_USER when applicable.
function linux_bind_driver() {
	bdf="$1"
	driver_name="$2"
	old_driver_name=${drivers_d["$bdf"]:-no driver}
	ven_dev_id="${pci_ids_vendor["$bdf"]#0x} ${pci_ids_device["$bdf"]#0x}"

	if [[ $driver_name == "$old_driver_name" ]]; then
		pci_dev_echo "$bdf" "Already using the $old_driver_name driver"
		return 0
	fi

	if [[ $old_driver_name != "no driver" ]]; then
		echo "$ven_dev_id" > "/sys/bus/pci/devices/$bdf/driver/remove_id" 2> /dev/null || true
		echo "$bdf" > "/sys/bus/pci/devices/$bdf/driver/unbind"
	fi

	pci_dev_echo "$bdf" "$old_driver_name -> $driver_name"

	echo "$ven_dev_id" > "/sys/bus/pci/drivers/$driver_name/new_id" 2> /dev/null || true
	echo "$bdf" > "/sys/bus/pci/drivers/$driver_name/bind" 2> /dev/null || true

	if [[ $driver_name == uio_pci_generic ]] && ! check_for_driver igb_uio; then
		# Check if the uio_pci_generic driver is broken as it might be in
		# some 4.18.x kernels (see centos8 for instance) - if our device
		# didn't get a proper uio entry, fallback to igb_uio
		if [[ ! -e /sys/bus/pci/devices/$bdf/uio ]]; then
			pci_dev_echo "$bdf" "uio_pci_generic potentially broken, moving to igb_uio"
			drivers_d["$bdf"]="no driver"
			# This call will override $driver_name for remaining devices as well
			linux_bind_driver "$bdf" igb_uio
		fi
	fi

	iommu_group=$(basename "$(readlink -f "/sys/bus/pci/devices/$bdf/iommu_group")")
	if [ -e "/dev/vfio/$iommu_group" ]; then
		if [ -n "$TARGET_USER" ]; then
			chown "$TARGET_USER" "/dev/vfio/$iommu_group"
		fi
	fi
}

# Unbind device $1 (BDF) from its current driver, leaving it driverless.
function linux_unbind_driver() {
	local bdf="$1"
	local ven_dev_id
	ven_dev_id="${pci_ids_vendor["$bdf"]#0x} ${pci_ids_device["$bdf"]#0x}"
	local old_driver_name=${drivers_d["$bdf"]:-no driver}

	if [[ -e /sys/bus/pci/drivers/$old_driver_name ]]; then
		echo "$ven_dev_id" > "/sys/bus/pci/drivers/$old_driver_name/remove_id" 2> /dev/null || true
		echo "$bdf" > "/sys/bus/pci/drivers/$old_driver_name/unbind"
	fi

	pci_dev_echo "$bdf" "$old_driver_name -> no driver"
}

# List mountpoints of all currently mounted hugetlbfs filesystems.
function linux_hugetlbfs_mounts() {
	mount | grep ' type hugetlbfs ' | awk '{ print $3 }'
}

# Print the block device names (e.g. nvme0n1) that belong to PCI device $1.
function get_block_dev_from_bdf() {
	local bdf=$1
	local block

	for block in /sys/block/*; do
		if [[ $(readlink -f "$block/device") == *"/$bdf/"* ]]; then
			echo "${block##*/}"
		fi
	done
}

# Print "block:partition" pairs for every mounted partition that lives on a
# block device attached to PCI device $1. Requires lsblk.
function get_mounted_part_dev_from_bdf_block() {
	local bdf=$1
	local blocks block dev mount

	hash lsblk || return 1
	blocks=($(get_block_dev_from_bdf "$bdf"))

	for block in "${blocks[@]}"; do
		while read -r dev mount; do
			if [[ -e $mount ]]; then
				echo "$block:$dev"
			fi
		done < <(lsblk -l -n -o NAME,MOUNTPOINT "/dev/$block")
	done
}

# Populate the global per-type device maps (nvme_d, ioat_d, idxd_d, virtio_d,
# vmd_d), all_devices_d (bdf -> 0 usable / 1 skipped) and drivers_d
# (bdf -> current driver) based on the ids found in spdk/pci_ids.h.
# $1 - current mode; in "status" mode no devices are filtered out.
function collect_devices() {
	# NVMe, IOAT, IDXD, VIRTIO, VMD

	local ids dev_type dev_id bdf bdfs in_use driver

	ids+="PCI_DEVICE_ID_INTEL_IOAT"
	ids+="|PCI_DEVICE_ID_INTEL_IDXD"
	ids+="|PCI_DEVICE_ID_VIRTIO"
	ids+="|PCI_DEVICE_ID_INTEL_VMD"
	ids+="|SPDK_PCI_CLASS_NVME"

	local -gA nvme_d ioat_d idxd_d virtio_d vmd_d all_devices_d drivers_d

	while read -r _ dev_type dev_id; do
		bdfs=(${pci_bus_cache["0x8086:$dev_id"]})
		[[ $dev_type == *NVME* ]] && bdfs=(${pci_bus_cache["$dev_id"]})
		[[ $dev_type == *VIRT* ]] && bdfs=(${pci_bus_cache["0x1af4:$dev_id"]})
		[[ $dev_type =~ (NVME|IOAT|IDXD|VIRTIO|VMD) ]] && dev_type=${BASH_REMATCH[1],,}
		for bdf in "${bdfs[@]}"; do
			in_use=0
			if [[ $1 != status ]]; then
				if ! pci_can_use "$bdf"; then
					pci_dev_echo "$bdf" "Skipping denied controller at $bdf"
					in_use=1
				fi
				if [[ $dev_type == nvme || $dev_type == virtio ]]; then
					if ! verify_bdf_mounts "$bdf"; then
						in_use=1
					fi
				fi
				if [[ $dev_type == vmd ]]; then
					if [[ $PCI_ALLOWED != *"$bdf"* ]]; then
						pci_dev_echo "$bdf" "Skipping not allowed VMD controller at $bdf"
						in_use=1
					fi
				fi
			fi
			eval "${dev_type}_d[$bdf]=$in_use"
			all_devices_d["$bdf"]=$in_use
			if [[ -e /sys/bus/pci/devices/$bdf/driver ]]; then
				driver=$(readlink -f "/sys/bus/pci/devices/$bdf/driver")
				drivers_d["$bdf"]=${driver##*/}
			fi
		done
	done < <(grep -E "$ids" "$rootdir/include/spdk/pci_ids.h")
}

# Print the name of the kernel driver that device $1 (BDF) should be bound back
# to on reset - resolved via modalias when possible, otherwise guessed from the
# device type collected by collect_devices().
function collect_driver() {
	local bdf=$1
	local drivers driver

	[[ -e /sys/bus/pci/devices/$bdf/modalias ]] || return 1
	if drivers=($(modprobe -R "$(< "/sys/bus/pci/devices/$bdf/modalias")")); then
		# Pick first entry in case multiple aliases are bound to a driver.
		driver=$(readlink -f "/sys/module/${drivers[0]}/drivers/pci:"*)
		driver=${driver##*/}
	else
		[[ -n ${nvme_d["$bdf"]} ]] && driver=nvme
		[[ -n ${ioat_d["$bdf"]} ]] && driver=ioatdma
		[[ -n ${idxd_d["$bdf"]} ]] && driver=idxd
		[[ -n ${virtio_d["$bdf"]} ]] && driver=virtio-pci
		[[ -n ${vmd_d["$bdf"]} ]] && driver=vmd
	fi 2> /dev/null # the redirection covers the whole if-compound above
	echo "$driver"
}

# Return 1 (and report) when any partition backed by device $1 is mounted.
function verify_bdf_mounts() {
	local bdf=$1
	local blknames
	blknames=($(get_mounted_part_dev_from_bdf_block "$bdf")) || return 1

	if ((${#blknames[@]} > 0)); then
		local IFS=","
		pci_dev_echo "$bdf" "Active mountpoints on ${blknames[*]}, so not binding PCI dev"
		return 1
	fi
}

# Select a userspace driver (vfio-pci > uio_pci_generic > igb_uio, unless
# DRIVER_OVERRIDE is set), load it, and bind all usable collected devices to it.
function configure_linux_pci() {
	local driver_path=""
	driver_name=""
	igb_uio_fallback=""

	if [[ -r "$rootdir/dpdk/build-tmp/kernel/linux/igb_uio/igb_uio.ko" ]]; then
		# igb_uio is a common driver to override with and it depends on uio.
		modprobe uio || true
		if ! check_for_driver igb_uio || insmod "$rootdir/dpdk/build-tmp/kernel/linux/igb_uio/igb_uio.ko"; then
			igb_uio_fallback="$rootdir/dpdk/build-tmp/kernel/linux/igb_uio/igb_uio.ko"
		fi
	fi

	if [[ -n "${DRIVER_OVERRIDE}" ]]; then
		driver_path="$DRIVER_OVERRIDE"
		driver_name="${DRIVER_OVERRIDE##*/}"
		# modprobe and the sysfs don't use the .ko suffix.
		driver_name=${driver_name%.ko}
		# path = name -> there is no path
		if [[ "$driver_path" = "$driver_name" ]]; then
			driver_path=""
		fi
	elif [[ -n "$(ls /sys/kernel/iommu_groups)" || (-e \
		/sys/module/vfio/parameters/enable_unsafe_noiommu_mode && \
		"$(cat /sys/module/vfio/parameters/enable_unsafe_noiommu_mode)" == "Y") ]]; then
		driver_name=vfio-pci
		# Just in case, attempt to load VFIO_IOMMU_TYPE1 module into the kernel - this
		# should be done automatically by modprobe since this particular module should
		# be a part of vfio-pci dependencies, however, on some distros, it seems that
		# it's not the case. See #1689.
		if modinfo vfio_iommu_type1 > /dev/null; then
			modprobe vfio_iommu_type1
		fi
	elif ! check_for_driver uio_pci_generic || modinfo uio_pci_generic > /dev/null 2>&1; then
		driver_name=uio_pci_generic
	elif [[ -e $igb_uio_fallback ]]; then
		driver_path="$igb_uio_fallback"
		driver_name="igb_uio"
		echo "WARNING: uio_pci_generic not detected - using $driver_name"
	else
		echo "No valid drivers found [vfio-pci, uio_pci_generic, igb_uio]. Please enable one of the kernel modules."
		return 1
	fi

	# modprobe assumes the directory of the module. If the user passes in a path, we should use insmod
	if [[ -n "$driver_path" ]]; then
		insmod "$driver_path" || true
	else
		modprobe "$driver_name"
	fi

	for bdf in "${!all_devices_d[@]}"; do
		if ((all_devices_d["$bdf"] == 0)); then
			if [[ -n ${nvme_d["$bdf"]} ]]; then
				# Some nvme controllers may take significant amount of time while being
				# unbound from the driver. Put that task into background to speed up the
				# whole process. Currently this is done only for the devices bound to the
				# nvme driver as other, i.e., ioatdma's, trigger a kernel BUG when being
				# unbound in parallel. See https://bugzilla.kernel.org/show_bug.cgi?id=209041.
				linux_bind_driver "$bdf" "$driver_name" &
			else
				linux_bind_driver "$bdf" "$driver_name"
			fi
		fi
	done
	wait

	echo "1" > "/sys/bus/pci/rescan"
}

# Remove leftover SPDK runtime files (hugepage maps, trace shm files, dpdk run
# dirs) that are no longer held open by any process.
function cleanup_linux() {
	shopt -s extglob nullglob
	dirs_to_clean=""
	dirs_to_clean="$(echo {/var/run,/tmp}/dpdk/spdk{,_pid}+([0-9])) "
	if [[ -d $XDG_RUNTIME_DIR && $XDG_RUNTIME_DIR != *" "* ]]; then
		# "assert_not_empty" is a bogus path that forces readlink -e to fail
		# (-> "|| true" path) when the glob matched nothing.
		dirs_to_clean+="$(readlink -e assert_not_empty $XDG_RUNTIME_DIR/dpdk/spdk{,_pid}+([0-9]) || true) "
	fi

	files_to_clean=""
	for dir in $dirs_to_clean; do
		files_to_clean+="$(echo $dir/*) "
	done
	shopt -u extglob nullglob

	files_to_clean+="$(ls -1 /dev/shm/* \
		| grep -E '(spdk_tgt|iscsi|vhost|nvmf|rocksdb|bdevio|bdevperf|vhost_fuzz|nvme_fuzz)_trace|spdk_iscsi_conns' || true) "
	files_to_clean="$(readlink -e assert_not_empty $files_to_clean || true)"
	if [[ -z "$files_to_clean" ]]; then
		echo "Clean"
		return 0
	fi

	shopt -s extglob
	for fd_dir in $(echo /proc/+([0-9])); do
		opened_files+="$(readlink -e assert_not_empty $fd_dir/fd/* || true)"
	done
	shopt -u extglob

	if [[ -z "$opened_files" ]]; then
		echo "Can't get list of opened files!"
		exit 1
	fi

	echo 'Cleaning'
	for f in $files_to_clean; do
		if ! echo "$opened_files" | grep -E -q "^$f\$"; then
			echo "Removing: $f"
			rm -- "$f"
		else
			echo "Still open: $f"
		fi
	done

	for dir in $dirs_to_clean; do
		if ! echo "$opened_files" | grep -E -q "^$dir\$"; then
			echo "Removing: $dir"
			rmdir "$dir"
		else
			echo "Still open: $dir"
		fi
	done
	echo "Clean"

	unset dirs_to_clean files_to_clean opened_files
}

# Request NRHUGE hugepages through sysfs/procfs interface $1 and verify the
# kernel actually granted them. $2 - optional node id, used in the error text.
check_hugepages_alloc() {
	local hp_int=$1
	local allocated_hugepages

	# Negative NRHUGE would be written as a huge unsigned value; clamp to 0.
	echo $((NRHUGE < 0 ? 0 : NRHUGE)) > "$hp_int"

	allocated_hugepages=$(< "$hp_int")
	if ((allocated_hugepages < NRHUGE)); then
		cat <<- ERROR

			## ERROR: requested $NRHUGE hugepages but $allocated_hugepages could be allocated ${2:+on node$2}.
			## Memory might be heavily fragmented. Please try flushing the system cache, or reboot the machine.
		ERROR
		return 1
	fi
}

# Release all default-sized hugepages system-wide.
clear_hugepages() { echo 0 > /proc/sys/vm/nr_hugepages; }

# Allocate hugepages according to HUGE_EVEN_ALLOC / HUGENODE / NRHUGE. Per-node
# allocation goes through the per-node sysfs files for the HUGEPGSZ page size.
configure_linux_hugepages() {
	local node system_nodes
	local nodes_to_use nodes_hp

	if [[ $CLEAR_HUGE == yes ]]; then
		clear_hugepages
	fi

	if [[ $HUGE_EVEN_ALLOC == yes ]]; then
		clear_hugepages
		check_hugepages_alloc /proc/sys/vm/nr_hugepages
		return 0
	fi

	for node in /sys/devices/system/node/node*; do
		[[ -e $node ]] || continue
		nodes[${node##*node}]=$node/hugepages/hugepages-${HUGEPGSZ}kB/nr_hugepages
	done

	IFS="," read -ra nodes_to_use <<< "$HUGENODE"
	if ((${#nodes_to_use[@]} == 0)); then
		nodes_to_use[0]=0
	fi

	# Align indexes with node ids
	for node in "${!nodes_to_use[@]}"; do
		if [[ ${nodes_to_use[node]} =~ ^nodes_hp\[[0-9]+\]= ]]; then
			eval "${nodes_to_use[node]}"
		elif [[ ${nodes_to_use[node]} =~ ^[0-9]+$ ]]; then
			nodes_hp[nodes_to_use[node]]=$NRHUGE
		fi
	done

	for node in "${!nodes_hp[@]}"; do
		if [[ -z ${nodes[node]} ]]; then
			echo "Node $node doesn't exist, ignoring" >&2
			continue
		fi
		NRHUGE=${nodes_hp[node]:-$NRHUGE} check_hugepages_alloc "${nodes[node]}" "$node"
	done
}

# Full Linux "config" mode: bind PCI devices, mount hugetlbfs if needed,
# allocate hugepages, fix up permissions/limits for TARGET_USER and make sure
# the msr module is available.
function configure_linux() {
	configure_linux_pci
	hugetlbfs_mounts=$(linux_hugetlbfs_mounts)

	if [ -z "$hugetlbfs_mounts" ]; then
		hugetlbfs_mounts=/mnt/huge
		echo "Mounting hugetlbfs at $hugetlbfs_mounts"
		mkdir -p "$hugetlbfs_mounts"
		mount -t hugetlbfs nodev "$hugetlbfs_mounts"
	fi

	configure_linux_hugepages

	if [ "$driver_name" = "vfio-pci" ]; then
		if [ -n "$TARGET_USER" ]; then
			for mount in $hugetlbfs_mounts; do
				chown "$TARGET_USER" "$mount"
				chmod g+w "$mount"
			done

			MEMLOCK_AMNT=$(su "$TARGET_USER" -c "ulimit -l")
			if [[ $MEMLOCK_AMNT != "unlimited" ]]; then
				MEMLOCK_MB=$((MEMLOCK_AMNT / 1024))
				cat <<- MEMLOCK
					"$TARGET_USER" user memlock limit: $MEMLOCK_MB MB

					This is the maximum amount of memory you will be
					able to use with DPDK and VFIO if run as user "$TARGET_USER".
					To change this, please adjust limits.conf memlock limit for user "$TARGET_USER".
				MEMLOCK
				if ((MEMLOCK_AMNT < 65536)); then
					echo ""
					echo "## WARNING: memlock limit is less than 64MB"
					echo -n "## DPDK with VFIO may not be able to initialize "
					echo "if run as user \"$TARGET_USER\"."
				fi
			fi
		fi
	fi

	if [ ! -e /dev/cpu/0/msr ]; then
		# Some distros build msr as a module. Make sure it's loaded to ensure
		# DPDK can easily figure out the TSC rate rather than relying on 100ms
		# sleeps.
		modprobe msr || true
	fi
}

# Rebind every usable collected device back to its original kernel driver
# (or unbind it when no suitable driver can be resolved).
function reset_linux_pci() {
	# virtio
	# TODO: check if virtio-pci is loaded first and just unbind if it is not loaded
	# Requires some more investigation - for example, some kernels do not seem to have
	# virtio-pci but just virtio_scsi instead. Also need to make sure we get the
	# underscore vs. dash right in the virtio_scsi name.
	modprobe virtio-pci || true
	for bdf in "${!all_devices_d[@]}"; do
		((all_devices_d["$bdf"] == 0)) || continue

		driver=$(collect_driver "$bdf")
		if ! check_for_driver "$driver"; then
			linux_bind_driver "$bdf" "$driver"
		else
			linux_unbind_driver "$bdf"
		fi
	done

	echo "1" > "/sys/bus/pci/rescan"
}

# Linux "reset" mode: rebind devices and remove unlocked SPDK hugepage maps.
function reset_linux() {
	reset_linux_pci
	for mount in $(linux_hugetlbfs_mounts); do
		for hp in "$mount"/spdk*map_*; do
			# Only remove maps that no running process holds flock'ed.
			flock -n "$hp" true && rm -f "$hp"
		done
	done
	rm -f /run/.spdk*
}

# Linux "status" mode: report hugepage availability per node and the state of
# every SPDK-compatible PCI device.
function status_linux() {
	echo "Hugepages" >&2
	printf "%-6s %10s %8s / %6s\n" "node" "hugesize" "free" "total" >&2

	numa_nodes=0
	shopt -s nullglob
	for path in /sys/devices/system/node/node*/hugepages/hugepages-*/; do
		numa_nodes=$((numa_nodes + 1))
		free_pages=$(cat "$path/free_hugepages")
		all_pages=$(cat "$path/nr_hugepages")

		[[ $path =~ (node[0-9]+)/hugepages/hugepages-([0-9]+kB) ]]

		node=${BASH_REMATCH[1]}
		huge_size=${BASH_REMATCH[2]}

		printf "%-6s %10s %8s / %6s\n" $node $huge_size $free_pages $all_pages
	done
	shopt -u nullglob

	# fall back to system-wide hugepages
	if [ "$numa_nodes" = "0" ]; then
		free_pages=$(grep HugePages_Free /proc/meminfo | awk '{ print $2 }')
		all_pages=$(grep HugePages_Total /proc/meminfo | awk '{ print $2 }')
		node="-"
		huge_size="$HUGEPGSZ"

		printf "%-6s %10s %8s / %6s\n" $node $huge_size $free_pages $all_pages
	fi

	printf '\n%-8s %-15s %-6s %-6s %-7s %-16s %-10s %s\n' \
		"Type" "BDF" "Vendor" "Device" "NUMA" "Driver" "Device" "Block devices" >&2

	sorted_bdfs=($(printf '%s\n' "${!all_devices_d[@]}" | sort))

	for bdf in "${sorted_bdfs[@]}"; do
		driver=${drivers_d["$bdf"]}
		if [ "$numa_nodes" = "0" ]; then
			node="-"
		else
			node=$(cat /sys/bus/pci/devices/$bdf/numa_node)
			if ((node == -1)); then
				node=unknown
			fi
		fi
		if [ "$driver" = "nvme" ] && [ -d /sys/bus/pci/devices/$bdf/nvme ]; then
			name=$(ls /sys/bus/pci/devices/$bdf/nvme)
		else
			name="-"
		fi

		if [[ -n ${nvme_d["$bdf"]} || -n ${virtio_d["$bdf"]} ]]; then
			blknames=($(get_block_dev_from_bdf "$bdf"))
		else
			blknames=("-")
		fi

		# First non-empty per-type entry determines the printed device type.
		desc=""
		desc=${desc:-${nvme_d["$bdf"]:+NVMe}}
		desc=${desc:-${ioat_d["$bdf"]:+I/OAT}}
		desc=${desc:-${idxd_d["$bdf"]:+IDXD}}
		desc=${desc:-${virtio_d["$bdf"]:+virtio}}
		desc=${desc:-${vmd_d["$bdf"]:+VMD}}

		printf '%-8s %-15s %-6s %-6s %-7s %-16s %-10s %s\n' \
			"$desc" "$bdf" "${pci_ids_vendor["$bdf"]#0x}" "${pci_ids_device["$bdf"]#0x}" \
			"$node" "${driver:--}" "${name:-}" "${blknames[*]:--}"
	done
}

# FreeBSD "status" mode: report contigmem state and per-type device tables.
function status_freebsd() {
	local pci

	status_print() (
		local dev driver

		echo -e "BDF\t\tVendor\tDevice\tDriver"

		for pci; do
			driver=$(pciconf -l "pci$pci")
			driver=${driver%@*}
			printf '%s\t%s\t%s\t%s\n' \
				"$pci" \
				"${pci_ids_vendor["$pci"]}" \
				"${pci_ids_device["$pci"]}" \
				"$driver"
		done | sort -k1,1
	)

	local contigmem=present
	local contigmem_buffer_size
	local contigmem_num_buffers

	if ! kldstat -q -m contigmem; then
		contigmem="not present"
	fi
	if ! contigmem_buffer_size=$(kenv hw.contigmem.buffer_size 2> /dev/null); then
		contigmem_buffer_size="not set"
	fi
	if ! contigmem_num_buffers=$(kenv hw.contigmem.num_buffers 2> /dev/null); then
		contigmem_num_buffers="not set"
	fi

	cat <<- BSD_INFO
		Contigmem ($contigmem)
		Buffer Size: $contigmem_buffer_size
		Num Buffers: $contigmem_num_buffers

		NVMe devices
		$(status_print "${!nvme_d[@]}")

		I/IOAT DMA
		$(status_print "${!ioat_d[@]}")

		IDXD DMA
		$(status_print "${!idxd_d[@]}")

		VMD
		$(status_print "${!vmd_d[@]}")
	BSD_INFO
}

# Reload nic_uio with the list of collected devices so it claims them.
function configure_freebsd_pci() {
	local BDFS

	BDFS+=("${!nvme_d[@]}")
	BDFS+=("${!ioat_d[@]}")
	BDFS+=("${!idxd_d[@]}")
	BDFS+=("${!vmd_d[@]}")

	# Drop the domain part from all the addresses
	BDFS=("${BDFS[@]#*:}")

	local IFS=","
	kldunload nic_uio.ko || true
	kenv hw.nic_uio.bdfs="${BDFS[*]}"
	kldload nic_uio.ko
}

# FreeBSD "config" mode: bind devices via nic_uio and (re)load contigmem sized
# according to HUGEMEM (in 256 MB buffers).
function configure_freebsd() {
	if ! check_for_driver_freebsd; then
		echo "DPDK drivers (contigmem and/or nic_uio) are missing, aborting" >&2
		return 1
	fi
	configure_freebsd_pci
	# If contigmem is already loaded but the HUGEMEM specified doesn't match the
	# previous value, unload contigmem so that we can reload with the new value.
	if kldstat -q -m contigmem; then
		# contigmem may be loaded, but the kernel environment doesn't have to
		# be necessarily set at this point. If it isn't, kenv will fail to
		# pick up the hw. options. Handle it.
		if ! contigmem_num_buffers=$(kenv hw.contigmem.num_buffers); then
			contigmem_num_buffers=-1
		fi 2> /dev/null
		if ((contigmem_num_buffers != HUGEMEM / 256)); then
			kldunload contigmem.ko
		fi
	fi
	if ! kldstat -q -m contigmem; then
		kenv hw.contigmem.num_buffers=$((HUGEMEM / 256))
		kenv hw.contigmem.buffer_size=$((256 * 1024 * 1024))
		kldload contigmem.ko
	fi
}

# FreeBSD "reset" mode: unload both DPDK kernel modules.
function reset_freebsd() {
	kldunload contigmem.ko || true
	kldunload nic_uio.ko || true
}

# Populate pci_bus_cache/pci_ids_* (provided by common.sh) before dispatching.
CMD=reset cache_pci_bus

mode=$1

if [ -z "$mode" ]; then
	mode="config"
fi

: ${HUGEMEM:=2048}
: ${PCI_ALLOWED:=""}
: ${PCI_BLOCKED:=""}

if [ -n "$NVME_ALLOWED" ]; then
	PCI_ALLOWED="$PCI_ALLOWED $NVME_ALLOWED"
fi

if [ -n "$SKIP_PCI" ]; then
	PCI_ALLOWED="none"
fi

if [ -z "$TARGET_USER" ]; then
	TARGET_USER="$SUDO_USER"
	if [ -z "$TARGET_USER" ]; then
		TARGET_USER=$(logname 2> /dev/null) || true
	fi
fi

collect_devices "$mode"

if [[ $mode == reset && $PCI_BLOCK_SYNC_ON_RESET == yes ]]; then
	# Note that this will wait only for the first block device attached to
	# a given storage controller. For nvme this may miss some of the devs
	# in case multiple namespaces are being in place.
	# FIXME: Wait for nvme controller(s) to be in live state and determine
	# number of configured namespaces, build list of potential block devs
	# and pass them to sync_dev_uevents. Is it worth the effort?
	bdfs_to_wait_for=()
	for bdf in "${!all_devices_d[@]}"; do
		((all_devices_d["$bdf"] == 0)) || continue
		if [[ -n ${nvme_d["$bdf"]} || -n ${virtio_d["$bdf"]} ]]; then
			[[ $(collect_driver "$bdf") != "${drivers_d["$bdf"]}" ]] || continue
			bdfs_to_wait_for+=("$bdf")
		fi
	done
	if ((${#bdfs_to_wait_for[@]} > 0)); then
		echo "Waiting for block devices as requested"
		export UEVENT_TIMEOUT=5 DEVPATH_LOOKUP=yes DEVPATH_SUBSYSTEM=pci
		"$rootdir/scripts/sync_dev_uevents.sh" \
			block/disk \
			"${bdfs_to_wait_for[@]}" &
		sync_pid=$!
	fi
fi

if [[ $os == Linux ]]; then
	if [[ -n $HUGEPGSZ && ! -e /sys/kernel/mm/hugepages/hugepages-${HUGEPGSZ}kB ]]; then
		echo "${HUGEPGSZ}kB is not supported by the running kernel, ignoring" >&2
		unset -v HUGEPGSZ
	fi

	HUGEPGSZ=${HUGEPGSZ:-$(grep Hugepagesize /proc/meminfo | cut -d : -f 2 | tr -dc '0-9')}
	HUGEPGSZ_MB=$((HUGEPGSZ / 1024))
	# Round HUGEMEM (MB) up to a whole number of hugepages.
	: ${NRHUGE=$(((HUGEMEM + HUGEPGSZ_MB - 1) / HUGEPGSZ_MB))}

	if [ "$mode" == "config" ]; then
		configure_linux
	elif [ "$mode" == "cleanup" ]; then
		cleanup_linux
	elif [ "$mode" == "reset" ]; then
		reset_linux
	elif [ "$mode" == "status" ]; then
		status_linux
	elif [ "$mode" == "help" ]; then
		usage $0
	else
		usage $0 "Invalid argument '$mode'"
	fi
else
	if [ "$mode" == "config" ]; then
		configure_freebsd
	elif [ "$mode" == "reset" ]; then
		reset_freebsd
	elif [ "$mode" == "cleanup" ]; then
		echo "setup.sh cleanup function not yet supported on $os"
	elif [ "$mode" == "status" ]; then
		status_freebsd
	elif [ "$mode" == "help" ]; then
		usage $0
	else
		usage $0 "Invalid argument '$mode'"
	fi
fi

# If a uevent-sync helper was started for reset, wait for it to finish.
if [[ -e /proc/$sync_pid/status ]]; then
	wait "$sync_pid"
fi