#!/usr/bin/env bash

set -e

os=$(uname -s)

if [[ $os != Linux && $os != FreeBSD ]]; then
	echo "Unsupported platform ($os), aborting"
	exit 1
fi

rootdir=$(readlink -f "$(dirname "$0")")/..
source "$rootdir/scripts/common.sh"

function usage() {
	if [[ $os == Linux ]]; then
		options="[config|reset|status|cleanup|help]"
	else
		options="[config|reset|help]"
	fi

	[[ -n $2 ]] && (
		echo "$2"
		echo ""
	)
	echo "Helper script for allocating hugepages and binding NVMe, I/OAT, VMD and Virtio devices"
	echo "to a generic VFIO kernel driver. If VFIO is not available on the system, this script"
	echo "will fall back to UIO. NVMe and Virtio devices with active mountpoints will be ignored."
	echo "All hugepage operations use the default hugepage size on the system (hugepagesz)."
	echo "Usage: $(basename $1) $options"
	echo
	echo "$options - as follows:"
	echo "config            Default mode. Allocate hugepages and bind PCI devices."
	if [[ $os == Linux ]]; then
		echo "cleanup           Remove any orphaned files that can be left in the system after SPDK application exit"
	fi
	echo "reset             Rebind PCI devices back to their original drivers."
	echo "                  Also cleanup any leftover spdk files/resources."
	echo "                  Hugepage memory size will remain unchanged."
	if [[ $os == Linux ]]; then
		echo "status            Print status of all SPDK-compatible devices on the system."
	fi
	echo "help              Print this help message."
	echo
	echo "The following environment variables can be specified."
	echo "HUGEMEM           Size of hugepage memory to allocate (in MB). 2048 by default."
	echo "                  For NUMA systems, the hugepages will be distributed on node0 by"
	echo "                  default."
	echo "HUGE_EVEN_ALLOC   If set to 'yes', hugepages will be evenly distributed across all"
	echo "                  system's NUMA nodes (effectively ignoring anything set in HUGENODE)."
	echo "                  Uses the kernel's default hugepage size."
	echo "NRHUGE            Number of hugepages to allocate. This variable overrides HUGEMEM."
	echo "HUGENODE          Specific NUMA node to allocate hugepages on. Multiple nodes can be"
	echo "                  separated with commas. By default, NRHUGE will be applied on each node."
	echo "                  Hugepages can be defined per node with e.g.:"
	echo "                  HUGENODE='nodes_hp[0]=2048,nodes_hp[1]=512,2' - this will allocate"
	echo "                  2048 pages for node0, 512 for node1 and default NRHUGE for node2."
	echo "HUGEPGSZ          Size of the hugepages to use in kB. If not set, kernel's default"
	echo "                  setting is used."
	echo "CLEAR_HUGE        If set to 'yes', an attempt to remove hugepages from all nodes will"
	echo "                  be made prior to allocation."
	echo "PCI_ALLOWED"
	echo "PCI_BLOCKED       Whitespace separated list of PCI devices (NVMe, I/OAT, VMD, Virtio)."
	echo "                  Each device must be specified as a full PCI address."
	echo "                  E.g. PCI_ALLOWED=\"0000:01:00.0 0000:02:00.0\""
	echo "                  To block all PCI devices: PCI_ALLOWED=\"none\""
	echo "                  To allow all PCI devices except 0000:01:00.0: PCI_BLOCKED=\"0000:01:00.0\""
	echo "                  To allow only PCI device 0000:01:00.0: PCI_ALLOWED=\"0000:01:00.0\""
	echo "                  If PCI_ALLOWED and PCI_BLOCKED are empty or unset, all PCI devices"
	echo "                  will be bound."
	echo "                  Each device in PCI_BLOCKED will be ignored (driver won't be changed)."
	echo "                  PCI_BLOCKED has precedence over PCI_ALLOWED."
	echo "TARGET_USER       User that will own hugepage mountpoint directory and vfio groups."
	echo "                  By default the current user will be used."
	echo "DRIVER_OVERRIDE   Disable automatic vfio-pci/uio_pci_generic selection and forcefully"
	echo "                  bind devices to the given driver."
	echo "                  E.g. DRIVER_OVERRIDE=uio_pci_generic or DRIVER_OVERRIDE=/home/public/dpdk/build/kmod/igb_uio.ko"
	echo "PCI_BLOCK_SYNC_ON_RESET"
	echo "                  If set in the environment, an attempt to wait for block devices associated"
	echo "                  with a given PCI device will be made upon reset"
	exit 0
}
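# Example invocations (illustrative values only; 0000:01:00.0 is a placeholder
# BDF that must be replaced with a real address from lspci):
#   sudo ./setup.sh                       # 'config' is the default mode
#   sudo HUGEMEM=4096 ./setup.sh config   # allocate 4096 MB of hugepages first
#   sudo PCI_ALLOWED="0000:01:00.0" ./setup.sh config
#   sudo ./setup.sh reset                 # rebind devices to original drivers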
# In monolithic kernels lsmod won't work, so fall back to /sys/module.
# We also check /sys/bus/pci/drivers/ as neither lsmod nor /sys/module
# might contain the needed info (like on Fedora-like OSes).
function check_for_driver() {
	if lsmod | grep -q ${1//-/_}; then
		return 1
	fi

	if [[ -d /sys/module/${1} || -d \
		/sys/module/${1//-/_} || -d \
		/sys/bus/pci/drivers/${1} || -d \
		/sys/bus/pci/drivers/${1//-/_} ]]; then
		return 2
	fi
	return 0
}

function check_for_driver_freebsd() {
	# Check if dpdk drivers (nic_uio, contigmem) are in the kernel's module path.
	local search_paths path driver
	IFS=";" read -ra search_paths < <(kldconfig -rU)

	for driver in contigmem.ko nic_uio.ko; do
		for path in "${search_paths[@]}"; do
			[[ -f $path/$driver ]] && continue 2
		done
		return 1
	done
	return 0
}

function pci_dev_echo() {
	local bdf="$1"
	shift
	echo "$bdf (${pci_ids_vendor["$bdf"]#0x} ${pci_ids_device["$bdf"]#0x}): $*"
}

function linux_bind_driver() {
	bdf="$1"
	driver_name="$2"
	old_driver_name=${drivers_d["$bdf"]:-no driver}
	ven_dev_id="${pci_ids_vendor["$bdf"]#0x} ${pci_ids_device["$bdf"]#0x}"

	if [[ $driver_name == "$old_driver_name" ]]; then
		pci_dev_echo "$bdf" "Already using the $old_driver_name driver"
		return 0
	fi

	if [[ $old_driver_name != "no driver" ]]; then
		echo "$ven_dev_id" > "/sys/bus/pci/devices/$bdf/driver/remove_id" 2> /dev/null || true
		echo "$bdf" > "/sys/bus/pci/devices/$bdf/driver/unbind"
	fi

	pci_dev_echo "$bdf" "$old_driver_name -> $driver_name"

	echo "$ven_dev_id" > "/sys/bus/pci/drivers/$driver_name/new_id" 2> /dev/null || true
	echo "$bdf" > "/sys/bus/pci/drivers/$driver_name/bind" 2> /dev/null || true

	if [[ $driver_name == uio_pci_generic ]] && ! check_for_driver igb_uio; then
		# Check if the uio_pci_generic driver is broken as it might be in
		# some 4.18.x kernels (see centos8 for instance) - if our device
		# didn't get a proper uio entry, fall back to igb_uio
		if [[ ! -e /sys/bus/pci/devices/$bdf/uio ]]; then
			pci_dev_echo "$bdf" "uio_pci_generic potentially broken, moving to igb_uio"
			drivers_d["$bdf"]="no driver"
			# This call will override $driver_name for remaining devices as well
			linux_bind_driver "$bdf" igb_uio
		fi
	fi

	iommu_group=$(basename $(readlink -f /sys/bus/pci/devices/$bdf/iommu_group))
	if [ -e "/dev/vfio/$iommu_group" ]; then
		if [ -n "$TARGET_USER" ]; then
			chown "$TARGET_USER" "/dev/vfio/$iommu_group"
		fi
	fi
}
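# For reference, linux_bind_driver above is roughly equivalent to this manual
# sysfs sequence for a single device (BDF and vendor/device IDs below are
# placeholders):
#   echo "0000:01:00.0" > /sys/bus/pci/devices/0000:01:00.0/driver/unbind
#   echo "8086 0953" > /sys/bus/pci/drivers/vfio-pci/new_id
#   echo "0000:01:00.0" > /sys/bus/pci/drivers/vfio-pci/bind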
function linux_unbind_driver() {
	local bdf="$1"
	local ven_dev_id
	ven_dev_id="${pci_ids_vendor["$bdf"]#0x} ${pci_ids_device["$bdf"]#0x}"
	local old_driver_name=${drivers_d["$bdf"]:-no driver}

	if [[ -e /sys/bus/pci/drivers/$old_driver_name ]]; then
		echo "$ven_dev_id" > "/sys/bus/pci/drivers/$old_driver_name/remove_id" 2> /dev/null || true
		echo "$bdf" > "/sys/bus/pci/drivers/$old_driver_name/unbind"
	fi

	pci_dev_echo "$bdf" "$old_driver_name -> no driver"
}

function linux_hugetlbfs_mounts() {
	mount | grep ' type hugetlbfs ' | awk '{ print $3 }'
}

function get_block_dev_from_bdf() {
	local bdf=$1
	local block

	for block in /sys/block/*; do
		if [[ $(readlink -f "$block/device") == *"/$bdf/"* ]]; then
			echo "${block##*/}"
		fi
	done
}

function get_mounted_part_dev_from_bdf_block() {
	local bdf=$1
	local blocks block dev mount

	hash lsblk || return 1
	blocks=($(get_block_dev_from_bdf "$bdf"))

	for block in "${blocks[@]}"; do
		while read -r dev mount; do
			if [[ -e $mount ]]; then
				echo "$block:$dev"
			fi
		done < <(lsblk -l -n -o NAME,MOUNTPOINT "/dev/$block")
	done
}

function collect_devices() {
	# NVMe, IOAT, IDXD, VIRTIO, VMD

	local ids dev_type dev_id bdf bdfs in_use driver

	ids+="PCI_DEVICE_ID_INTEL_IOAT"
	ids+="|PCI_DEVICE_ID_INTEL_IDXD"
	ids+="|PCI_DEVICE_ID_VIRTIO"
	ids+="|PCI_DEVICE_ID_INTEL_VMD"
	ids+="|SPDK_PCI_CLASS_NVME"

	local -gA nvme_d ioat_d idxd_d virtio_d vmd_d all_devices_d drivers_d

	while read -r _ dev_type dev_id; do
		bdfs=(${pci_bus_cache["0x8086:$dev_id"]})
		[[ $dev_type == *NVME* ]] && bdfs=(${pci_bus_cache["$dev_id"]})
		[[ $dev_type == *VIRT* ]] && bdfs=(${pci_bus_cache["0x1af4:$dev_id"]})
		[[ $dev_type =~ (NVME|IOAT|IDXD|VIRTIO|VMD) ]] && dev_type=${BASH_REMATCH[1],,}
		for bdf in "${bdfs[@]}"; do
			in_use=0
			if [[ $1 != status ]]; then
				if ! pci_can_use "$bdf"; then
					pci_dev_echo "$bdf" "Skipping denied controller at $bdf"
					in_use=1
				fi
				if [[ $dev_type == nvme || $dev_type == virtio ]]; then
					if ! verify_bdf_mounts "$bdf"; then
						in_use=1
					fi
				fi
				if [[ $dev_type == vmd ]]; then
					if [[ $PCI_ALLOWED != *"$bdf"* ]]; then
						pci_dev_echo "$bdf" "Skipping not allowed VMD controller at $bdf"
						in_use=1
					fi
				fi
			fi
			eval "${dev_type}_d[$bdf]=$in_use"
			all_devices_d["$bdf"]=$in_use
			if [[ -e /sys/bus/pci/devices/$bdf/driver ]]; then
				driver=$(readlink -f "/sys/bus/pci/devices/$bdf/driver")
				drivers_d["$bdf"]=${driver##*/}
			fi
		done
	done < <(grep -E "$ids" "$rootdir/include/spdk/pci_ids.h")
}
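# After collect_devices returns, each per-type array (nvme_d, ioat_d, idxd_d,
# virtio_d, vmd_d) maps a BDF to its in_use flag, e.g. (illustrative BDF):
#   nvme_d[0000:01:00.0]=0   # 0 -> free to rebind, 1 -> skipped / in use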
function collect_driver() {
	local bdf=$1
	local drivers driver

	[[ -e /sys/bus/pci/devices/$bdf/modalias ]] || return 1
	if drivers=($(modprobe -R "$(< "/sys/bus/pci/devices/$bdf/modalias")")); then
		# Pick the first entry in case multiple aliases are bound to a driver.
		driver=$(readlink -f "/sys/module/${drivers[0]}/drivers/pci:"*)
		driver=${driver##*/}
	else
		[[ -n ${nvme_d["$bdf"]} ]] && driver=nvme
		[[ -n ${ioat_d["$bdf"]} ]] && driver=ioatdma
		[[ -n ${idxd_d["$bdf"]} ]] && driver=idxd
		[[ -n ${virtio_d["$bdf"]} ]] && driver=virtio-pci
		[[ -n ${vmd_d["$bdf"]} ]] && driver=vmd
	fi 2> /dev/null
	echo "$driver"
}

function verify_bdf_mounts() {
	local bdf=$1
	local blknames
	blknames=($(get_mounted_part_dev_from_bdf_block "$bdf")) || return 1

	if ((${#blknames[@]} > 0)); then
		local IFS=","
		pci_dev_echo "$bdf" "Active mountpoints on ${blknames[*]}, so not binding PCI dev"
		return 1
	fi
}

function configure_linux_pci() {
	local driver_path=""
	driver_name=""
	igb_uio_fallback=""

	if [[ -r "$rootdir/dpdk/build-tmp/kernel/linux/igb_uio/igb_uio.ko" ]]; then
		# igb_uio is a common driver to override with, and it depends on uio.
		modprobe uio || true
		if ! check_for_driver igb_uio || insmod "$rootdir/dpdk/build-tmp/kernel/linux/igb_uio/igb_uio.ko"; then
			igb_uio_fallback="$rootdir/dpdk/build-tmp/kernel/linux/igb_uio/igb_uio.ko"
		fi
	fi

	if [[ -n "${DRIVER_OVERRIDE}" ]]; then
		driver_path="$DRIVER_OVERRIDE"
		driver_name="${DRIVER_OVERRIDE##*/}"
		# modprobe and the sysfs don't use the .ko suffix.
		driver_name=${driver_name%.ko}
		# path = name -> there is no path
		if [[ "$driver_path" = "$driver_name" ]]; then
			driver_path=""
		fi
	elif [[ -n "$(ls /sys/kernel/iommu_groups)" || (-e \
		/sys/module/vfio/parameters/enable_unsafe_noiommu_mode && \
		"$(cat /sys/module/vfio/parameters/enable_unsafe_noiommu_mode)" == "Y") ]]; then
		driver_name=vfio-pci
		# Just in case, attempt to load the VFIO_IOMMU_TYPE1 module into the kernel - this
		# should be done automatically by modprobe since this particular module should
		# be a part of vfio-pci dependencies, however, on some distros, it seems that
		# it's not the case. See #1689.
		if modinfo vfio_iommu_type1 > /dev/null; then
			modprobe vfio_iommu_type1
		fi
	elif ! check_for_driver uio_pci_generic || modinfo uio_pci_generic > /dev/null 2>&1; then
		driver_name=uio_pci_generic
	elif [[ -e $igb_uio_fallback ]]; then
		driver_path="$igb_uio_fallback"
		driver_name="igb_uio"
		echo "WARNING: uio_pci_generic not detected - using $driver_name"
	else
		echo "No valid drivers found [vfio-pci, uio_pci_generic, igb_uio]. Please enable one of the kernel modules."
		return 1
	fi

	# modprobe assumes the module lives in the kernel's module directory. If the
	# user passed in a path, use insmod instead.
	if [[ -n "$driver_path" ]]; then
		insmod $driver_path || true
	else
		modprobe $driver_name
	fi

	for bdf in "${!all_devices_d[@]}"; do
		if ((all_devices_d["$bdf"] == 0)); then
			if [[ -n ${nvme_d["$bdf"]} ]]; then
				# Some nvme controllers may take a significant amount of time while being
				# unbound from the driver. Put that task into the background to speed up the
				# whole process. Currently this is done only for the devices bound to the
				# nvme driver as others, i.e., ioatdma's, trigger a kernel BUG when being
				# unbound in parallel. See https://bugzilla.kernel.org/show_bug.cgi?id=209041.
				linux_bind_driver "$bdf" "$driver_name" &
			else
				linux_bind_driver "$bdf" "$driver_name"
			fi
		fi
	done
	wait

	echo "1" > "/sys/bus/pci/rescan"
}
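# The selection above boils down to: DRIVER_OVERRIDE (if set) > vfio-pci (when
# an IOMMU is available) > uio_pci_generic > locally built igb_uio. E.g., to
# force a custom module (path is illustrative):
#   sudo DRIVER_OVERRIDE=/path/to/igb_uio.ko ./setup.sh config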
function cleanup_linux() {
	shopt -s extglob nullglob
	dirs_to_clean=""
	dirs_to_clean="$(echo {/var/run,/tmp}/dpdk/spdk{,_pid}+([0-9])) "
	if [[ -d $XDG_RUNTIME_DIR && $XDG_RUNTIME_DIR != *" "* ]]; then
		dirs_to_clean+="$(readlink -e assert_not_empty $XDG_RUNTIME_DIR/dpdk/spdk{,_pid}+([0-9]) || true) "
	fi

	files_to_clean="" file_locks=()
	for dir in $dirs_to_clean; do
		files_to_clean+="$(echo $dir/*) "
	done
	file_locks+=(/var/tmp/spdk_pci_lock*)
	shopt -u extglob nullglob

	files_to_clean+="$(ls -1 /dev/shm/* \
		| grep -E '(spdk_tgt|iscsi|vhost|nvmf|rocksdb|bdevio|bdevperf|vhost_fuzz|nvme_fuzz)_trace|spdk_iscsi_conns' || true) "
	files_to_clean+=" ${file_locks[*]}"
	files_to_clean="$(readlink -e assert_not_empty $files_to_clean || true)"
	if [[ -z "$files_to_clean" ]]; then
		echo "Clean"
		return 0
	fi

	shopt -s extglob
	for fd_dir in $(echo /proc/+([0-9])); do
		opened_files+="$(readlink -e assert_not_empty $fd_dir/fd/* || true)"
	done
	shopt -u extglob

	if [[ -z "$opened_files" ]]; then
		echo "Can't get list of opened files!"
		exit 1
	fi

	echo 'Cleaning'
	for f in $files_to_clean; do
		if ! echo "$opened_files" | grep -E -q "^$f\$"; then
			echo "Removing: $f"
			rm $f
		else
			echo "Still open: $f"
		fi
	done

	for dir in $dirs_to_clean; do
		if ! echo "$opened_files" | grep -E -q "^$dir\$"; then
			echo "Removing: $dir"
			rmdir $dir
		else
			echo "Still open: $dir"
		fi
	done
	echo "Clean"

	unset dirs_to_clean files_to_clean opened_files
}

check_hugepages_alloc() {
	local hp_int=$1
	local allocated_hugepages

	echo $((NRHUGE < 0 ? 0 : NRHUGE)) > "$hp_int"

	allocated_hugepages=$(< "$hp_int")
	if ((allocated_hugepages < NRHUGE)); then
		cat <<- ERROR

			## ERROR: requested $NRHUGE hugepages but $allocated_hugepages could be allocated ${2:+on node$2}.
			## Memory might be heavily fragmented. Please try flushing the system cache, or reboot the machine.
		ERROR
		return 1
	fi
}

clear_hugepages() { echo 0 > /proc/sys/vm/nr_hugepages; }
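# E.g. (illustrative) a fresh allocation that first drops any existing pages:
#   sudo CLEAR_HUGE=yes NRHUGE=1024 ./setup.sh config
# With the common 2 MB hugepage size, 1024 pages is also what the default
# HUGEMEM=2048 (MB) rounds up to.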
configure_linux_hugepages() {
	local node system_nodes
	local nodes_to_use nodes_hp

	if [[ $CLEAR_HUGE == yes ]]; then
		clear_hugepages
	fi

	if [[ $HUGE_EVEN_ALLOC == yes ]]; then
		clear_hugepages
		check_hugepages_alloc /proc/sys/vm/nr_hugepages
		return 0
	fi

	for node in /sys/devices/system/node/node*; do
		[[ -e $node ]] || continue
		nodes[${node##*node}]=$node/hugepages/hugepages-${HUGEPGSZ}kB/nr_hugepages
	done

	if ((${#nodes[@]} == 0)); then
		# No NUMA support? Fall back to the common interface.
		check_hugepages_alloc /proc/sys/vm/nr_hugepages
		return 0
	fi

	IFS="," read -ra nodes_to_use <<< "$HUGENODE"
	if ((${#nodes_to_use[@]} == 0)); then
		nodes_to_use[0]=0
	fi

	# Align indexes with node ids
	for node in "${!nodes_to_use[@]}"; do
		if [[ ${nodes_to_use[node]} =~ ^nodes_hp\[[0-9]+\]= ]]; then
			eval "${nodes_to_use[node]}"
		elif [[ ${nodes_to_use[node]} =~ ^[0-9]+$ ]]; then
			nodes_hp[nodes_to_use[node]]=$NRHUGE
		fi
	done

	for node in "${!nodes_hp[@]}"; do
		if [[ -z ${nodes[node]} ]]; then
			echo "Node $node doesn't exist, ignoring" >&2
			continue
		fi
		NRHUGE=${nodes_hp[node]:-$NRHUGE} check_hugepages_alloc "${nodes[node]}" "$node"
	done
}

function configure_linux() {
	configure_linux_pci
	hugetlbfs_mounts=$(linux_hugetlbfs_mounts)

	if [ -z "$hugetlbfs_mounts" ]; then
		hugetlbfs_mounts=/mnt/huge
		echo "Mounting hugetlbfs at $hugetlbfs_mounts"
		mkdir -p "$hugetlbfs_mounts"
		mount -t hugetlbfs nodev "$hugetlbfs_mounts"
	fi

	configure_linux_hugepages

	if [ "$driver_name" = "vfio-pci" ]; then
		if [ -n "$TARGET_USER" ]; then
			for mount in $hugetlbfs_mounts; do
				chown "$TARGET_USER" "$mount"
				chmod g+w "$mount"
			done

			MEMLOCK_AMNT=$(su "$TARGET_USER" -c "ulimit -l")
			if [[ $MEMLOCK_AMNT != "unlimited" ]]; then
				MEMLOCK_MB=$((MEMLOCK_AMNT / 1024))
				cat <<- MEMLOCK
					"$TARGET_USER" user memlock limit: $MEMLOCK_MB MB

					This is the maximum amount of memory you will be
					able to use with DPDK and VFIO if run as user "$TARGET_USER".
					To change this, please adjust limits.conf memlock limit for user "$TARGET_USER".
				MEMLOCK
				if ((MEMLOCK_AMNT < 65536)); then
					echo ""
					echo "## WARNING: memlock limit is less than 64MB"
					echo -n "## DPDK with VFIO may not be able to initialize "
					echo "if run as user \"$TARGET_USER\"."
				fi
			fi
		fi
	fi

	if [ ! -e /dev/cpu/0/msr ]; then
		# Some distros build msr as a module. Make sure it's loaded to ensure
		# DPDK can easily figure out the TSC rate rather than relying on 100ms
		# sleeps.
		modprobe msr || true
	fi
}
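# Per-node example from the usage text (values illustrative): allocate 2048
# pages on node0, 512 on node1, and the default NRHUGE on node2:
#   sudo HUGENODE='nodes_hp[0]=2048,nodes_hp[1]=512,2' ./setup.sh config
# To raise the memlock limit warned about above, an entry along these
# (assumed typical) lines can be added to /etc/security/limits.conf:
#   <user>  hard  memlock  unlimited
#   <user>  soft  memlock  unlimited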
function reset_linux_pci() {
	# virtio
	# TODO: check if virtio-pci is loaded first and just unbind if it is not loaded.
	# Requires some more investigation - for example, some kernels do not seem to have
	# virtio-pci but just virtio_scsi instead. Also need to make sure we get the
	# underscore vs. dash right in the virtio_scsi name.
	modprobe virtio-pci || true
	for bdf in "${!all_devices_d[@]}"; do
		((all_devices_d["$bdf"] == 0)) || continue

		driver=$(collect_driver "$bdf")
		if ! check_for_driver "$driver"; then
			linux_bind_driver "$bdf" "$driver"
		else
			linux_unbind_driver "$bdf"
		fi
	done

	echo "1" > "/sys/bus/pci/rescan"
}

function reset_linux() {
	reset_linux_pci
	for mount in $(linux_hugetlbfs_mounts); do
		for hp in "$mount"/spdk*map_*; do
			flock -n "$hp" true && rm -f "$hp"
		done
	done
	rm -f /run/.spdk*
}

function status_linux() {
	echo "Hugepages" >&2
	printf "%-6s %10s %8s / %6s\n" "node" "hugesize" "free" "total" >&2

	numa_nodes=0
	shopt -s nullglob
	for path in /sys/devices/system/node/node*/hugepages/hugepages-*/; do
		numa_nodes=$((numa_nodes + 1))
		free_pages=$(cat $path/free_hugepages)
		all_pages=$(cat $path/nr_hugepages)

		[[ $path =~ (node[0-9]+)/hugepages/hugepages-([0-9]+kB) ]]

		node=${BASH_REMATCH[1]}
		huge_size=${BASH_REMATCH[2]}

		printf "%-6s %10s %8s / %6s\n" $node $huge_size $free_pages $all_pages
	done
	shopt -u nullglob

	# fall back to system-wide hugepages
	if [ "$numa_nodes" = "0" ]; then
		free_pages=$(grep HugePages_Free /proc/meminfo | awk '{ print $2 }')
		all_pages=$(grep HugePages_Total /proc/meminfo | awk '{ print $2 }')
		node="-"
		huge_size="$HUGEPGSZ"

		printf "%-6s %10s %8s / %6s\n" $node $huge_size $free_pages $all_pages
	fi

	printf '\n%-8s %-15s %-6s %-6s %-7s %-16s %-10s %s\n' \
		"Type" "BDF" "Vendor" "Device" "NUMA" "Driver" "Device" "Block devices" >&2

	sorted_bdfs=($(printf '%s\n' "${!all_devices_d[@]}" | sort))

	for bdf in "${sorted_bdfs[@]}"; do
		driver=${drivers_d["$bdf"]}
		if [ "$numa_nodes" = "0" ]; then
			node="-"
		else
			node=$(cat /sys/bus/pci/devices/$bdf/numa_node)
			if ((node == -1)); then
				node=unknown
			fi
		fi
		if [ "$driver" = "nvme" ] && [ -d /sys/bus/pci/devices/$bdf/nvme ]; then
			name=$(ls /sys/bus/pci/devices/$bdf/nvme)
		else
			name="-"
		fi

		if [[ -n ${nvme_d["$bdf"]} || -n ${virtio_d["$bdf"]} ]]; then
			blknames=($(get_block_dev_from_bdf "$bdf"))
		else
			blknames=("-")
		fi

		desc=""
		desc=${desc:-${nvme_d["$bdf"]:+NVMe}}
		desc=${desc:-${ioat_d["$bdf"]:+I/OAT}}
		desc=${desc:-${idxd_d["$bdf"]:+IDXD}}
		desc=${desc:-${virtio_d["$bdf"]:+virtio}}
		desc=${desc:-${vmd_d["$bdf"]:+VMD}}

		printf '%-8s %-15s %-6s %-6s %-7s %-16s %-10s %s\n' \
			"$desc" "$bdf" "${pci_ids_vendor["$bdf"]#0x}" "${pci_ids_device["$bdf"]#0x}" \
			"$node" "${driver:--}" "${name:-}" "${blknames[*]:--}"
	done
}
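# Illustrative 'status' output for a single NVMe controller (all values made
# up for the example):
#   Type     BDF             Vendor Device NUMA    Driver           Device     Block devices
#   NVMe     0000:01:00.0    8086   0953   0       vfio-pci         -          -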
function status_freebsd() {
	local pci

	status_print() (
		local dev driver

		echo -e "BDF\t\tVendor\tDevice\tDriver"

		for pci; do
			driver=$(pciconf -l "pci$pci")
			driver=${driver%@*}
			printf '%s\t%s\t%s\t%s\n' \
				"$pci" \
				"${pci_ids_vendor["$pci"]}" \
				"${pci_ids_device["$pci"]}" \
				"$driver"
		done | sort -k1,1
	)

	local contigmem=present
	local contigmem_buffer_size
	local contigmem_num_buffers

	if ! kldstat -q -m contigmem; then
		contigmem="not present"
	fi
	if ! contigmem_buffer_size=$(kenv hw.contigmem.buffer_size 2> /dev/null); then
		contigmem_buffer_size="not set"
	fi
	if ! contigmem_num_buffers=$(kenv hw.contigmem.num_buffers 2> /dev/null); then
		contigmem_num_buffers="not set"
	fi

	cat <<- BSD_INFO
		Contigmem ($contigmem)
		Buffer Size: $contigmem_buffer_size
		Num Buffers: $contigmem_num_buffers

		NVMe devices
		$(status_print "${!nvme_d[@]}")

		I/OAT DMA
		$(status_print "${!ioat_d[@]}")

		IDXD DMA
		$(status_print "${!idxd_d[@]}")

		VMD
		$(status_print "${!vmd_d[@]}")
	BSD_INFO
}

function configure_freebsd_pci() {
	local BDFS

	BDFS+=("${!nvme_d[@]}")
	BDFS+=("${!ioat_d[@]}")
	BDFS+=("${!idxd_d[@]}")
	BDFS+=("${!vmd_d[@]}")

	# Drop the domain part from all the addresses
	BDFS=("${BDFS[@]#*:}")

	local IFS=","
	kldunload nic_uio.ko || true
	kenv hw.nic_uio.bdfs="${BDFS[*]}"
	kldload nic_uio.ko
}

function configure_freebsd() {
	if ! check_for_driver_freebsd; then
		echo "DPDK drivers (contigmem and/or nic_uio) are missing, aborting" >&2
		return 1
	fi
	configure_freebsd_pci
	# If contigmem is already loaded but the HUGEMEM specified doesn't match the
	# previous value, unload contigmem so that we can reload with the new value.
	if kldstat -q -m contigmem; then
		# contigmem may be loaded, but the kernel environment doesn't have to
		# be necessarily set at this point. If it isn't, kenv will fail to
		# pick up the hw. options. Handle it.
		if ! contigmem_num_buffers=$(kenv hw.contigmem.num_buffers); then
			contigmem_num_buffers=-1
		fi 2> /dev/null
		if ((contigmem_num_buffers != HUGEMEM / 256)); then
			kldunload contigmem.ko
		fi
	fi
	if ! kldstat -q -m contigmem; then
		kenv hw.contigmem.num_buffers=$((HUGEMEM / 256))
		kenv hw.contigmem.buffer_size=$((256 * 1024 * 1024))
		kldload contigmem.ko
	fi
}

function reset_freebsd() {
	kldunload contigmem.ko || true
	kldunload nic_uio.ko || true
}

CMD=reset cache_pci_bus

mode=$1

if [ -z "$mode" ]; then
	mode="config"
fi

: ${HUGEMEM:=2048}
: ${PCI_ALLOWED:=""}
: ${PCI_BLOCKED:=""}

if [ -n "$NVME_ALLOWED" ]; then
	PCI_ALLOWED="$PCI_ALLOWED $NVME_ALLOWED"
fi

if [ -n "$SKIP_PCI" ]; then
	PCI_ALLOWED="none"
fi

if [ -z "$TARGET_USER" ]; then
	TARGET_USER="$SUDO_USER"
	if [ -z "$TARGET_USER" ]; then
		TARGET_USER=$(logname 2> /dev/null) || true
	fi
fi

collect_devices "$mode"

if [[ $mode == reset && $PCI_BLOCK_SYNC_ON_RESET == yes ]]; then
	# Note that this will wait only for the first block device attached to
	# a given storage controller. For nvme this may miss some of the devs
	# in case multiple namespaces are in place.
	# FIXME: Wait for nvme controller(s) to be in live state, determine the
	# number of configured namespaces, build a list of potential block devs
	# and pass them to sync_dev_uevents. Is it worth the effort?
	bdfs_to_wait_for=()
	for bdf in "${!all_devices_d[@]}"; do
		((all_devices_d["$bdf"] == 0)) || continue
		if [[ -n ${nvme_d["$bdf"]} || -n ${virtio_d["$bdf"]} ]]; then
			[[ $(collect_driver "$bdf") != "${drivers_d["$bdf"]}" ]] || continue
			bdfs_to_wait_for+=("$bdf")
		fi
	done
	if ((${#bdfs_to_wait_for[@]} > 0)); then
		echo "Waiting for block devices as requested"
		export UEVENT_TIMEOUT=5 DEVPATH_LOOKUP=yes DEVPATH_SUBSYSTEM=pci
		"$rootdir/scripts/sync_dev_uevents.sh" \
			block/disk \
			"${bdfs_to_wait_for[@]}" &
		sync_pid=$!
	fi
fi
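# Example (hypothetical): make 'reset' wait for the rebound kernel block
# devices to show up before the script returns:
#   sudo PCI_BLOCK_SYNC_ON_RESET=yes ./setup.sh reset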
if [[ $os == Linux ]]; then
	if [[ -n $HUGEPGSZ && ! -e /sys/kernel/mm/hugepages/hugepages-${HUGEPGSZ}kB ]]; then
		echo "${HUGEPGSZ}kB is not supported by the running kernel, ignoring" >&2
		unset -v HUGEPGSZ
	fi

	HUGEPGSZ=${HUGEPGSZ:-$(grep Hugepagesize /proc/meminfo | cut -d : -f 2 | tr -dc '0-9')}
	HUGEPGSZ_MB=$((HUGEPGSZ / 1024))
	: ${NRHUGE=$(((HUGEMEM + HUGEPGSZ_MB - 1) / HUGEPGSZ_MB))}

	if [ "$mode" == "config" ]; then
		configure_linux
	elif [ "$mode" == "cleanup" ]; then
		cleanup_linux
		clear_hugepages
	elif [ "$mode" == "reset" ]; then
		reset_linux
	elif [ "$mode" == "status" ]; then
		status_linux
	elif [ "$mode" == "help" ]; then
		usage $0
	else
		usage $0 "Invalid argument '$mode'"
	fi
else
	if [ "$mode" == "config" ]; then
		configure_freebsd
	elif [ "$mode" == "reset" ]; then
		reset_freebsd
	elif [ "$mode" == "cleanup" ]; then
		echo "setup.sh cleanup function not yet supported on $os"
	elif [ "$mode" == "status" ]; then
		status_freebsd
	elif [ "$mode" == "help" ]; then
		usage $0
	else
		usage $0 "Invalid argument '$mode'"
	fi
fi

if [[ -e /proc/$sync_pid/status ]]; then
	wait "$sync_pid"
fi