#!/usr/bin/env bash

set -e
shopt -s nullglob extglob

os=$(uname -s)

if [[ $os != Linux && $os != FreeBSD ]]; then
	echo "Unsupported platform ($os), aborting"
	exit 1
fi

rootdir=$(readlink -f $(dirname $0))/..
source "$rootdir/scripts/common.sh"

function usage() {
	if [[ $os == Linux ]]; then
		options="[config|reset|status|cleanup|help]"
	else
		options="[config|reset|help]"
	fi

	[[ -n $2 ]] && (
		echo "$2"
		echo ""
	)
	echo "Helper script for allocating hugepages and binding NVMe, I/OAT, VMD and Virtio devices"
	echo "to a generic VFIO kernel driver. If VFIO is not available on the system, this script"
	echo "will fall back to UIO. NVMe and Virtio devices with active mountpoints will be ignored."
	echo "All hugepage operations use the default hugepage size on the system (hugepagesz)."
	echo "Usage: $(basename $1) $options"
	echo
	echo "$options - as follows:"
	echo "config            Default mode. Allocate hugepages and bind PCI devices."
	if [[ $os == Linux ]]; then
		echo "cleanup           Remove any orphaned files that can be left in the system after SPDK application exit"
	fi
	echo "reset             Rebind PCI devices back to their original drivers."
	echo "                  Also cleanup any leftover spdk files/resources."
	echo "                  Hugepage memory size will remain unchanged."
	if [[ $os == Linux ]]; then
		echo "status            Print status of all SPDK-compatible devices on the system."
	fi
	echo "help              Print this help message."
	echo
	echo "The following environment variables can be specified."
	echo "HUGEMEM           Size of hugepage memory to allocate (in MB). 2048 by default."
	echo "                  For NUMA systems, the hugepages will be distributed on node0 by"
	echo "                  default."
	echo "HUGE_EVEN_ALLOC   If set to 'yes', hugepages will be evenly distributed across all"
	echo "                  system's NUMA nodes (effectively ignoring anything set in HUGENODE)."
	echo "                  Uses kernel's default for hugepages size."
	echo "NRHUGE            Number of hugepages to allocate. This variable overwrites HUGEMEM."
	echo "HUGENODE          Specific NUMA node to allocate hugepages on. Multiple nodes can be"
	echo "                  separated with commas. By default, NRHUGE will be applied on each node."
	echo "                  Hugepages can be defined per node with e.g.:"
	echo "                  HUGENODE='nodes_hp[0]=2048,nodes_hp[1]=512,2' - this will allocate"
	echo "                  2048 pages for node0, 512 for node1 and default NRHUGE for node2."
	echo "HUGEPGSZ          Size of the hugepages to use in kB. If not set, kernel's default"
	echo "                  setting is used."
	echo "SHRINK_HUGE       If set to 'yes', hugepages allocation won't be skipped in case the"
	echo "                  number of requested hugepages is lower than what's already"
	echo "                  allocated. Doesn't apply when HUGE_EVEN_ALLOC is in use."
	echo "CLEAR_HUGE        If set to 'yes', an attempt to remove hugepages from all nodes will"
	echo "                  be made prior to allocation."
	echo "PCI_ALLOWED"
	echo "PCI_BLOCKED       Whitespace separated list of PCI devices (NVMe, I/OAT, VMD, Virtio)."
	echo "                  Each device must be specified as a full PCI address."
	echo "                  E.g. PCI_ALLOWED=\"0000:01:00.0 0000:02:00.0\""
	echo "                  To block all PCI devices: PCI_ALLOWED=\"none\""
	echo "                  To allow all PCI devices except 0000:01:00.0: PCI_BLOCKED=\"0000:01:00.0\""
	echo "                  To allow only PCI device 0000:01:00.0: PCI_ALLOWED=\"0000:01:00.0\""
	echo "                  If PCI_ALLOWED and PCI_BLOCKED are empty or unset, all PCI devices"
	echo "                  will be bound."
	echo "                  Each device in PCI_BLOCKED will be ignored (driver won't be changed)."
	echo "                  PCI_BLOCKED has precedence over PCI_ALLOWED."
	echo "TARGET_USER       User that will own hugepage mountpoint directory and vfio groups."
	echo "                  By default the current user will be used."
	echo "DRIVER_OVERRIDE   Disable automatic vfio-pci/uio_pci_generic selection and forcefully"
	echo "                  bind devices to the given driver."
	echo "                  E.g. DRIVER_OVERRIDE=uio_pci_generic or DRIVER_OVERRIDE=/home/public/dpdk/build/kmod/igb_uio.ko"
	echo "PCI_BLOCK_SYNC_ON_RESET"
	echo "                  If set in the environment, an attempt will be made upon reset to wait"
	echo "                  for block devices associated with a given PCI device."
	exit 0
}
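
# A few illustrative invocations (the PCI address below is hypothetical):
#   sudo HUGEMEM=4096 ./setup.sh                 # allocate 4 GB of hugepages, bind all devices
#   sudo PCI_ALLOWED="0000:01:00.0" ./setup.sh   # bind a single controller only
#   sudo DRIVER_OVERRIDE=none ./setup.sh         # unbind devices without binding a new driver
#   sudo ./setup.sh status                       # report devices/hugepages without changes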

# In monolithic kernels lsmod won't work, so fall back to /sys/module. We also
# check /sys/bus/pci/drivers/, as neither lsmod nor /sys/module may contain the
# needed info (as on Fedora-like OSes).
function check_for_driver() {
	if [[ -z $1 ]]; then
		return 0
	fi

	if lsmod | grep -q ${1//-/_}; then
		return 1
	fi

	if [[ -d /sys/module/${1} || -d \
		/sys/module/${1//-/_} || -d \
		/sys/bus/pci/drivers/${1} || -d \
		/sys/bus/pci/drivers/${1//-/_} ]]; then
		return 2
	fi
	return 0
}
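
# For illustration only, the tri-state return value above could be consumed
# like this (hypothetical caller, not used by this script):
#   check_for_driver vfio-pci
#   case $? in
#     0) echo "vfio-pci not present" ;;
#     1) echo "vfio-pci loaded as a module" ;;
#     2) echo "vfio-pci built-in or otherwise registered in sysfs" ;;
#   esac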

function check_for_driver_freebsd() {
	# Check if dpdk drivers (nic_uio, contigmem) are in the kernel's module path.
	local search_paths path driver
	IFS=";" read -ra search_paths < <(kldconfig -rU)

	for driver in contigmem.ko nic_uio.ko; do
		for path in "${search_paths[@]}"; do
			[[ -f $path/$driver ]] && continue 2
		done
		return 1
	done
	return 0
}

function pci_dev_echo() {
	local bdf="$1"
	shift
	echo "$bdf (${pci_ids_vendor["$bdf"]#0x} ${pci_ids_device["$bdf"]#0x}): $*"
}

function linux_bind_driver() {
	bdf="$1"
	driver_name="$2"
	old_driver_name=${drivers_d["$bdf"]:-no driver}
	ven_dev_id="${pci_ids_vendor["$bdf"]#0x} ${pci_ids_device["$bdf"]#0x}"

	if [[ $driver_name == "$old_driver_name" ]]; then
		pci_dev_echo "$bdf" "Already using the $old_driver_name driver"
		return 0
	fi

	if [[ $old_driver_name != "no driver" ]]; then
		echo "$ven_dev_id" > "/sys/bus/pci/devices/$bdf/driver/remove_id" 2> /dev/null || true
		echo "$bdf" > "/sys/bus/pci/devices/$bdf/driver/unbind"
	fi

	pci_dev_echo "$bdf" "$old_driver_name -> $driver_name"

	if [[ $driver_name == "none" ]]; then
		return 0
	fi

	echo "$ven_dev_id" > "/sys/bus/pci/drivers/$driver_name/new_id" 2> /dev/null || true
	echo "$bdf" > "/sys/bus/pci/drivers/$driver_name/bind" 2> /dev/null || true

	if [[ $driver_name == uio_pci_generic ]] && ! check_for_driver igb_uio; then
		# Check if the uio_pci_generic driver is broken as it might be in
		# some 4.18.x kernels (see centos8 for instance) - if our device
		# didn't get a proper uio entry, fall back to igb_uio
		if [[ ! -e /sys/bus/pci/devices/$bdf/uio ]]; then
			pci_dev_echo "$bdf" "uio_pci_generic potentially broken, moving to igb_uio"
			drivers_d["$bdf"]="no driver"
			# This call will override $driver_name for remaining devices as well
			linux_bind_driver "$bdf" igb_uio
		fi
	fi

	iommu_group=$(basename $(readlink -f /sys/bus/pci/devices/$bdf/iommu_group))
	if [ -e "/dev/vfio/$iommu_group" ]; then
		if [ -n "$TARGET_USER" ]; then
			chown "$TARGET_USER" "/dev/vfio/$iommu_group"
		fi
	fi
}

function linux_unbind_driver() {
	local bdf="$1"
	local ven_dev_id
	ven_dev_id="${pci_ids_vendor["$bdf"]#0x} ${pci_ids_device["$bdf"]#0x}"
	local old_driver_name=${drivers_d["$bdf"]:-no driver}

	if [[ $old_driver_name == "no driver" ]]; then
		pci_dev_echo "$bdf" "Not bound to any driver"
		return 0
	fi

	if [[ -e /sys/bus/pci/drivers/$old_driver_name ]]; then
		echo "$ven_dev_id" > "/sys/bus/pci/drivers/$old_driver_name/remove_id" 2> /dev/null || true
		echo "$bdf" > "/sys/bus/pci/drivers/$old_driver_name/unbind"
	fi

	pci_dev_echo "$bdf" "$old_driver_name -> no driver"
}

function linux_hugetlbfs_mounts() {
	mount | grep ' type hugetlbfs ' | awk '{ print $3 }'
}

function get_block_dev_from_bdf() {
	local bdf=$1
	local block

	for block in /sys/block/*; do
		if [[ $(readlink -f "$block/device") == *"/$bdf/"* ]]; then
			echo "${block##*/}"
		fi
	done
}

function get_used_bdf_block_devs() {
	local bdf=$1
	local blocks block blockp dev mount holder
	local used

	hash lsblk &> /dev/null || return 1
	blocks=($(get_block_dev_from_bdf "$bdf"))

	for block in "${blocks[@]}"; do
		# Check if the device is held by some other device, regardless of
		# whether it's mounted or not.
		for holder in "/sys/class/block/$block"*/holders/*; do
			[[ -e $holder ]] || continue
			blockp=${holder%/holders*} blockp=${blockp##*/}
			if [[ -e $holder/slaves/$blockp ]]; then
				used+=("holder@$blockp:${holder##*/}")
			fi
		done
		while read -r dev mount; do
			if [[ -e $mount ]]; then
				used+=("mount@$block:$dev")
			fi
		done < <(lsblk -l -n -o NAME,MOUNTPOINT "/dev/$block")
		if ((${#used[@]} == 0)); then
			# Make sure we check if there's any valid data present on the target device
			# regardless of whether it's being actively used or not. This is mainly done
			# to make sure we don't miss more complex setups like ZFS pools, etc.
			if block_in_use "$block" > /dev/null; then
				used+=("data@$block")
			fi
		fi
	done

	if ((${#used[@]} > 0)); then
		printf '%s\n' "${used[@]}"
	fi
}
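
# Lines printed by get_used_bdf_block_devs() follow a tag@device pattern. A
# hypothetical NVMe disk backing an LVM volume plus a mounted partition might
# yield:
#   holder@nvme0n1:dm-0
#   mount@nvme0n1:nvme0n1p1
# while "data@nvme0n1" indicates a device that merely carries a recognizable
# signature (e.g. it is part of a ZFS pool).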

function collect_devices() {
	# NVMe, IOAT, DSA, IAA, VIRTIO, VMD

	local ids dev_type dev_id bdf bdfs in_use driver

	ids+="PCI_DEVICE_ID_INTEL_IOAT"
	ids+="|PCI_DEVICE_ID_INTEL_DSA"
	ids+="|PCI_DEVICE_ID_INTEL_IAA"
	ids+="|PCI_DEVICE_ID_VIRTIO"
	ids+="|PCI_DEVICE_ID_INTEL_VMD"
	ids+="|SPDK_PCI_CLASS_NVME"

	local -gA nvme_d ioat_d dsa_d iaa_d virtio_d vmd_d all_devices_d drivers_d

	while read -r _ dev_type dev_id; do
		bdfs=(${pci_bus_cache["0x8086:$dev_id"]})
		[[ $dev_type == *NVME* ]] && bdfs=(${pci_bus_cache["$dev_id"]})
		[[ $dev_type == *VIRT* ]] && bdfs=(${pci_bus_cache["0x1af4:$dev_id"]})
		[[ $dev_type =~ (NVME|IOAT|DSA|IAA|VIRTIO|VMD) ]] && dev_type=${BASH_REMATCH[1],,}
		for bdf in "${bdfs[@]}"; do
			in_use=0
			if [[ $1 != status ]]; then
				if ! pci_can_use "$bdf"; then
					pci_dev_echo "$bdf" "Skipping denied controller at $bdf"
					in_use=1
				fi
				if [[ $dev_type == nvme || $dev_type == virtio ]]; then
					if ! verify_bdf_block_devs "$bdf"; then
						in_use=1
					fi
				fi
				if [[ $dev_type == vmd ]]; then
					if [[ $PCI_ALLOWED != *"$bdf"* ]]; then
						pci_dev_echo "$bdf" "Skipping disallowed VMD controller at $bdf"
						in_use=1
					elif [[ " ${drivers_d[*]} " =~ "nvme" ]]; then
						if [[ "${DRIVER_OVERRIDE}" != "none" ]]; then
							if [ "$mode" == "config" ]; then
								cat <<- MESSAGE
									Binding new driver to VMD device. If there are NVMe SSDs behind the VMD endpoint
									which are attached to the kernel NVMe driver, the binding process may go faster
									if you first run this script with DRIVER_OVERRIDE="none" to unbind only the
									NVMe SSDs, and then run again to unbind the VMD devices.
								MESSAGE
							fi
						fi
					fi
				fi
			fi
			eval "${dev_type}_d[$bdf]=$in_use"
			all_devices_d["$bdf"]=$in_use
			if [[ -e /sys/bus/pci/devices/$bdf/driver ]]; then
				driver=$(readlink -f "/sys/bus/pci/devices/$bdf/driver")
				drivers_d["$bdf"]=${driver##*/}
			fi
		done
	done < <(grep -E "$ids" "$rootdir/include/spdk/pci_ids.h")
}

function collect_driver() {
	local bdf=$1
	local drivers driver

	if [[ -e /sys/bus/pci/devices/$bdf/modalias ]] \
		&& drivers=($(modprobe -R "$(< "/sys/bus/pci/devices/$bdf/modalias")")); then
		# Pick first entry in case multiple aliases are bound to a driver.
		driver=$(readlink -f "/sys/module/${drivers[0]}/drivers/pci:"*)
		driver=${driver##*/}
	else
		[[ -n ${nvme_d["$bdf"]} ]] && driver=nvme
		[[ -n ${ioat_d["$bdf"]} ]] && driver=ioatdma
		[[ -n ${dsa_d["$bdf"]} ]] && driver=dsa
		[[ -n ${iaa_d["$bdf"]} ]] && driver=iaa
		[[ -n ${virtio_d["$bdf"]} ]] && driver=virtio-pci
		[[ -n ${vmd_d["$bdf"]} ]] && driver=vmd
	fi 2> /dev/null
	echo "$driver"
}
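
# Illustrative walk-through of the modalias path above (identifiers are
# hypothetical):
#   $ cat /sys/bus/pci/devices/0000:01:00.0/modalias
#   pci:v00008086d0000F1A5sv...
#   $ modprobe -R "pci:v00008086d0000F1A5sv..."
#   nvme
# i.e. the kernel's alias database maps the device back to its native driver;
# the *_d lookups serve as a fallback when no modalias is available.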

function verify_bdf_block_devs() {
	local bdf=$1
	local blknames
	blknames=($(get_used_bdf_block_devs "$bdf")) || return 1

	if ((${#blknames[@]} > 0)); then
		local IFS=","
		pci_dev_echo "$bdf" "Active devices: ${blknames[*]}, so not binding PCI dev"
		return 1
	fi
}

function configure_linux_pci() {
	local driver_path=""
	driver_name=""
	igb_uio_fallback=""

	if [[ -r "$rootdir/dpdk/build-tmp/kernel/linux/igb_uio/igb_uio.ko" ]]; then
		# igb_uio is a common driver to override with and it depends on uio.
		modprobe uio || true
		if ! check_for_driver igb_uio || insmod "$rootdir/dpdk/build-tmp/kernel/linux/igb_uio/igb_uio.ko"; then
			igb_uio_fallback="$rootdir/dpdk/build-tmp/kernel/linux/igb_uio/igb_uio.ko"
		fi
	fi

	if [[ "${DRIVER_OVERRIDE}" == "none" ]]; then
		driver_name=none
	elif [[ -n "${DRIVER_OVERRIDE}" ]]; then
		driver_path="$DRIVER_OVERRIDE"
		driver_name="${DRIVER_OVERRIDE##*/}"
		# modprobe and the sysfs don't use the .ko suffix.
		driver_name=${driver_name%.ko}
		# path = name -> there is no path
		if [[ "$driver_path" = "$driver_name" ]]; then
			driver_path=""
		fi
	elif [[ -n "$(ls /sys/kernel/iommu_groups)" || (-e \
		/sys/module/vfio/parameters/enable_unsafe_noiommu_mode && \
		"$(cat /sys/module/vfio/parameters/enable_unsafe_noiommu_mode)" == "Y") ]]; then
		driver_name=vfio-pci
		# Just in case, attempt to load the VFIO_IOMMU_TYPE1 module into the kernel - this
		# should be done automatically by modprobe since this particular module should
		# be a part of vfio-pci dependencies, however, on some distros, it seems that
		# it's not the case. See #1689.
		if modinfo vfio_iommu_type1 > /dev/null; then
			modprobe vfio_iommu_type1
		fi
	elif ! check_for_driver uio_pci_generic || modinfo uio_pci_generic > /dev/null 2>&1; then
		driver_name=uio_pci_generic
	elif [[ -e $igb_uio_fallback ]]; then
		driver_path="$igb_uio_fallback"
		driver_name="igb_uio"
		echo "WARNING: uio_pci_generic not detected - using $driver_name"
	else
		echo "No valid drivers found [vfio-pci, uio_pci_generic, igb_uio]. Please enable one of the kernel modules."
		return 1
	fi

	# modprobe looks the module up in the kernel's module directory, so if the
	# user passed in a path, use insmod instead.
	if [[ $driver_name != "none" ]]; then
		if [[ -n "$driver_path" ]]; then
			insmod $driver_path || true
		else
			modprobe $driver_name
		fi
	fi

	for bdf in "${!all_devices_d[@]}"; do
		if ((all_devices_d["$bdf"] == 0)); then
			if [[ -n ${nvme_d["$bdf"]} ]]; then
				# Some nvme controllers may take a significant amount of time while being
				# unbound from the driver. Put that task into the background to speed up
				# the whole process. Currently this is done only for devices bound to the
				# nvme driver, as others, e.g. ioatdma's, trigger a kernel BUG when being
				# unbound in parallel. See https://bugzilla.kernel.org/show_bug.cgi?id=209041.
				linux_bind_driver "$bdf" "$driver_name" &
			else
				linux_bind_driver "$bdf" "$driver_name"
			fi
		fi
	done
	wait

	echo "1" > "/sys/bus/pci/rescan"
}
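
# The chain above selects a driver in this order of preference (summary only,
# no additional logic):
#   DRIVER_OVERRIDE=none               -> unbind only, bind nothing
#   DRIVER_OVERRIDE=<name or /path.ko> -> the forced driver (modprobe or insmod)
#   IOMMU groups present (or unsafe noiommu enabled) -> vfio-pci
#   uio_pci_generic available          -> uio_pci_generic
#   igb_uio built under dpdk/          -> igb_uio (with a warning)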

function cleanup_linux() {
	local dirs_to_clean=() files_to_clean=() opened_files=() file_locks=()
	local match_spdk="spdk_tgt|iscsi|vhost|nvmf|rocksdb|bdevio|bdevperf|vhost_fuzz|nvme_fuzz|accel_perf|bdev_svc"

	dirs_to_clean=({/var/run,/tmp}/dpdk/spdk{,_pid}+([0-9]))
	if [[ -d $XDG_RUNTIME_DIR ]]; then
		dirs_to_clean+=("$XDG_RUNTIME_DIR/dpdk/spdk"{,_pid}+([0-9]))
	fi

	for dir in "${dirs_to_clean[@]}"; do
		files_to_clean+=("$dir/"*)
	done
	file_locks+=(/var/tmp/spdk_pci_lock*)
	file_locks+=(/var/tmp/spdk_cpu_lock*)

	files_to_clean+=(/dev/shm/@(@($match_spdk)_trace|spdk_iscsi_conns)*)
	files_to_clean+=("${file_locks[@]}")

	# This may fail in case the path that readlink attempts to resolve suddenly
	# disappears (as it may happen with terminating processes).
	opened_files+=($(readlink -f /proc/+([0-9])/fd/+([0-9]))) || true

	if ((${#opened_files[@]} == 0)); then
		echo "Can't get list of opened files!"
		exit 1
	fi

	echo 'Cleaning'
	for f in "${files_to_clean[@]}"; do
		[[ -e $f ]] || continue
		if [[ ${opened_files[*]} != *"$f"* ]]; then
			echo "Removing: $f"
			rm $f
		else
			echo "Still open: $f"
		fi
	done

	for dir in "${dirs_to_clean[@]}"; do
		[[ -d $dir ]] || continue
		if [[ ${opened_files[*]} != *"$dir"* ]]; then
			echo "Removing: $dir"
			rmdir $dir
		else
			echo "Still open: $dir"
		fi
	done
	echo "Clean"
}

check_hugepages_alloc() {
	local hp_int=$1
	local allocated_hugepages

	allocated_hugepages=$(< "$hp_int")

	if ((NRHUGE <= allocated_hugepages)) && [[ $SHRINK_HUGE != yes ]]; then
		echo "INFO: Requested $NRHUGE hugepages but $allocated_hugepages already allocated ${2:+on node$2}"
		return 0
	fi

	echo $((NRHUGE < 0 ? 0 : NRHUGE)) > "$hp_int"

	allocated_hugepages=$(< "$hp_int")
	if ((allocated_hugepages < NRHUGE)); then
		cat <<- ERROR

			## ERROR: requested $NRHUGE hugepages but $allocated_hugepages could be allocated ${2:+on node$2}.
			## Memory might be heavily fragmented. Please try flushing the system cache, or reboot the machine.
		ERROR
		return 1
	fi
}

clear_hugepages() { echo 0 > /proc/sys/vm/nr_hugepages; }

configure_linux_hugepages() {
	local node system_nodes
	local nodes_to_use nodes_hp

	if [[ $CLEAR_HUGE == yes ]]; then
		clear_hugepages
	fi

	if [[ $HUGE_EVEN_ALLOC == yes ]]; then
		clear_hugepages
		check_hugepages_alloc /proc/sys/vm/nr_hugepages
		return 0
	fi

	for node in /sys/devices/system/node/node*; do
		[[ -e $node ]] || continue
		nodes[${node##*node}]=$node/hugepages/hugepages-${HUGEPGSZ}kB/nr_hugepages
	done

	if ((${#nodes[@]} == 0)); then
		# No NUMA support? Fall back to the common interface.
		check_hugepages_alloc /proc/sys/vm/nr_hugepages
		return 0
	fi

	IFS="," read -ra nodes_to_use <<< "$HUGENODE"
	if ((${#nodes_to_use[@]} == 0)); then
		nodes_to_use[0]=0
	fi

	# Align indexes with node ids
	for node in "${!nodes_to_use[@]}"; do
		if [[ ${nodes_to_use[node]} =~ ^nodes_hp\[[0-9]+\]= ]]; then
			eval "${nodes_to_use[node]}"
		elif [[ ${nodes_to_use[node]} =~ ^[0-9]+$ ]]; then
			nodes_hp[nodes_to_use[node]]=$NRHUGE
		fi
	done

	for node in "${!nodes_hp[@]}"; do
		if [[ -z ${nodes[node]} ]]; then
			echo "Node $node doesn't exist, ignoring" >&2
			continue
		fi
		NRHUGE=${nodes_hp[node]:-$NRHUGE} check_hugepages_alloc "${nodes[node]}" "$node"
	done
}
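
# Hypothetical hugepage configurations handled by the logic above (mirroring
# the examples from usage()):
#   HUGE_EVEN_ALLOC=yes HUGEMEM=2048 ./setup.sh   # spread 2 GB across all nodes
#   HUGENODE=0,1 NRHUGE=512 ./setup.sh            # 512 pages on node0 and node1
#   HUGENODE='nodes_hp[0]=2048,nodes_hp[1]=512' ./setup.sh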

function configure_linux() {
	configure_linux_pci
	hugetlbfs_mounts=$(linux_hugetlbfs_mounts)

	if [ -z "$hugetlbfs_mounts" ]; then
		hugetlbfs_mounts=/mnt/huge
		echo "Mounting hugetlbfs at $hugetlbfs_mounts"
		mkdir -p "$hugetlbfs_mounts"
		mount -t hugetlbfs nodev "$hugetlbfs_mounts"
	fi

	configure_linux_hugepages

	if [ "$driver_name" = "vfio-pci" ]; then
		if [ -n "$TARGET_USER" ]; then
			for mount in $hugetlbfs_mounts; do
				chown "$TARGET_USER" "$mount"
				chmod g+w "$mount"
			done

			MEMLOCK_AMNT=$(su "$TARGET_USER" -c "ulimit -l")
			if [[ $MEMLOCK_AMNT != "unlimited" ]]; then
				MEMLOCK_MB=$((MEMLOCK_AMNT / 1024))
				cat <<- MEMLOCK
					"$TARGET_USER" user memlock limit: $MEMLOCK_MB MB

					This is the maximum amount of memory you will be
					able to use with DPDK and VFIO if run as user "$TARGET_USER".
					To change this, please adjust limits.conf memlock limit for user "$TARGET_USER".
				MEMLOCK
				if ((MEMLOCK_AMNT < 65536)); then
					echo ""
					echo "## WARNING: memlock limit is less than 64MB"
					echo -n "## DPDK with VFIO may not be able to initialize "
					echo "if run as user \"$TARGET_USER\"."
				fi
			fi
		fi
	fi

	if [ $(uname -i) == "x86_64" ] && [ ! -e /dev/cpu/0/msr ]; then
		# Some distros build msr as a module. Make sure it's loaded to ensure
		# DPDK can easily figure out the TSC rate rather than relying on 100ms
		# sleeps.
		modprobe msr &> /dev/null || true
	fi
}

function reset_linux_pci() {
	# virtio
	# TODO: check if virtio-pci is loaded first and just unbind if it is not loaded
	# Requires some more investigation - for example, some kernels do not seem to have
	# virtio-pci but just virtio_scsi instead. Also need to make sure we get the
	# underscore vs. dash right in the virtio_scsi name.
	modprobe virtio-pci || true
	for bdf in "${!all_devices_d[@]}"; do
		((all_devices_d["$bdf"] == 0)) || continue

		driver=$(collect_driver "$bdf")
		if [[ -n $driver ]] && ! check_for_driver "$driver"; then
			linux_bind_driver "$bdf" "$driver"
		else
			linux_unbind_driver "$bdf"
		fi
	done

	echo "1" > "/sys/bus/pci/rescan"
}

function reset_linux() {
	reset_linux_pci
	for mount in $(linux_hugetlbfs_mounts); do
		for hp in "$mount"/spdk*map_*; do
			flock -n "$hp" true && rm -f "$hp"
		done
	done
	rm -f /run/.spdk*
}

function status_linux() {
	echo "Hugepages" >&2
	printf "%-6s %10s %8s / %6s\n" "node" "hugesize" "free" "total" >&2

	numa_nodes=0
	for path in /sys/devices/system/node/node*/hugepages/hugepages-*/; do
		numa_nodes=$((numa_nodes + 1))
		free_pages=$(cat $path/free_hugepages)
		all_pages=$(cat $path/nr_hugepages)

		[[ $path =~ (node[0-9]+)/hugepages/hugepages-([0-9]+kB) ]]

		node=${BASH_REMATCH[1]}
		huge_size=${BASH_REMATCH[2]}

		printf "%-6s %10s %8s / %6s\n" $node $huge_size $free_pages $all_pages
	done

	# fall back to system-wide hugepages
	if [ "$numa_nodes" = "0" ]; then
		free_pages=$(grep HugePages_Free /proc/meminfo | awk '{ print $2 }')
		all_pages=$(grep HugePages_Total /proc/meminfo | awk '{ print $2 }')
		node="-"
		huge_size="$HUGEPGSZ"

		printf "%-6s %10s %8s / %6s\n" $node $huge_size $free_pages $all_pages
	fi

	printf '\n%-8s %-15s %-6s %-6s %-7s %-16s %-10s %s\n' \
		"Type" "BDF" "Vendor" "Device" "NUMA" "Driver" "Device" "Block devices" >&2

	sorted_bdfs=($(printf '%s\n' "${!all_devices_d[@]}" | sort))

	for bdf in "${sorted_bdfs[@]}"; do
		driver=${drivers_d["$bdf"]}
		if [ "$numa_nodes" = "0" ]; then
			node="-"
		else
			node=$(cat /sys/bus/pci/devices/$bdf/numa_node)
			if ((node == -1)); then
				node=unknown
			fi
		fi
		if [ "$driver" = "nvme" ] && [ -d /sys/bus/pci/devices/$bdf/nvme ]; then
			name=$(ls /sys/bus/pci/devices/$bdf/nvme)
		else
			name="-"
		fi

		if [[ -n ${nvme_d["$bdf"]} || -n ${virtio_d["$bdf"]} ]]; then
			blknames=($(get_block_dev_from_bdf "$bdf"))
		else
			blknames=("-")
		fi

		desc=""
		desc=${desc:-${nvme_d["$bdf"]:+NVMe}}
		desc=${desc:-${ioat_d["$bdf"]:+I/OAT}}
		desc=${desc:-${dsa_d["$bdf"]:+DSA}}
		desc=${desc:-${iaa_d["$bdf"]:+IAA}}
		desc=${desc:-${virtio_d["$bdf"]:+virtio}}
		desc=${desc:-${vmd_d["$bdf"]:+VMD}}

		printf '%-8s %-15s %-6s %-6s %-7s %-16s %-10s %s\n' \
			"$desc" "$bdf" "${pci_ids_vendor["$bdf"]#0x}" "${pci_ids_device["$bdf"]#0x}" \
			"$node" "${driver:--}" "${name:-}" "${blknames[*]:--}"
	done
}
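
# For reference, "status" output resembles the following (all values are
# hypothetical):
#   Hugepages
#   node     hugesize     free /  total
#   node0      2048kB     1024 /   1024
#
#   Type     BDF             Vendor Device NUMA    Driver           Device     Block devices
#   NVMe     0000:01:00.0    8086   f1a5   0       nvme             nvme0      nvme0n1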

function status_freebsd() {
	local pci

	status_print() (
		local type=$1
		local dev driver

		shift

		for pci; do
			printf '%-8s %-15s %-6s %-6s %-16s\n' \
				"$type" \
				"$pci" \
				"${pci_ids_vendor["$pci"]}" \
				"${pci_ids_device["$pci"]}" \
				"${pci_bus_driver["$pci"]}"
		done | sort -k2,2
	)

	local contigmem=present
	local contigmem_buffer_size
	local contigmem_num_buffers

	if ! kldstat -q -m contigmem; then
		contigmem="not present"
	fi
	if ! contigmem_buffer_size=$(kenv hw.contigmem.buffer_size 2> /dev/null); then
		contigmem_buffer_size="not set"
	fi
	if ! contigmem_num_buffers=$(kenv hw.contigmem.num_buffers 2> /dev/null); then
		contigmem_num_buffers="not set"
	fi

	cat <<- BSD_INFO
		Contigmem ($contigmem)
		Buffer Size: $contigmem_buffer_size
		Num Buffers: $contigmem_num_buffers

	BSD_INFO

	printf '\n%-8s %-15s %-6s %-6s %-16s\n' \
		"Type" "BDF" "Vendor" "Device" "Driver" >&2

	status_print "NVMe" "${!nvme_d[@]}"
	status_print "I/OAT" "${!ioat_d[@]}"
	status_print "DSA" "${!dsa_d[@]}"
	status_print "IAA" "${!iaa_d[@]}"
	status_print "VMD" "${!vmd_d[@]}"
}

function configure_freebsd_pci() {
	local BDFS

	BDFS+=("${!nvme_d[@]}")
	BDFS+=("${!ioat_d[@]}")
	BDFS+=("${!dsa_d[@]}")
	BDFS+=("${!iaa_d[@]}")
	BDFS+=("${!vmd_d[@]}")

	# Drop the domain part from all the addresses
	BDFS=("${BDFS[@]#*:}")

	local IFS=","
	kldunload nic_uio.ko || true
	kenv hw.nic_uio.bdfs="${BDFS[*]}"
	kldload nic_uio.ko
}
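
# E.g. with hypothetical controllers 0000:01:00.0 and 0000:02:00.0, the
# sequence above ends up setting hw.nic_uio.bdfs="01:00.0,02:00.0" (domains
# stripped, joined with commas via IFS) before nic_uio is reloaded.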

function configure_freebsd() {
	if ! check_for_driver_freebsd; then
		echo "DPDK drivers (contigmem and/or nic_uio) are missing, aborting" >&2
		return 1
	fi
	configure_freebsd_pci
	# If contigmem is already loaded but the HUGEMEM specified doesn't match the
	# previous value, unload contigmem so that we can reload with the new value.
	if kldstat -q -m contigmem; then
		# contigmem may be loaded, but the kernel environment doesn't
		# necessarily have to be set at this point. If it isn't, kenv will
		# fail to pick up the hw. options. Handle it.
		if ! contigmem_num_buffers=$(kenv hw.contigmem.num_buffers); then
			contigmem_num_buffers=-1
		fi 2> /dev/null
		if ((contigmem_num_buffers != HUGEMEM / 256)); then
			kldunload contigmem.ko
		fi
	fi
	if ! kldstat -q -m contigmem; then
		kenv hw.contigmem.num_buffers=$((HUGEMEM / 256))
		kenv hw.contigmem.buffer_size=$((256 * 1024 * 1024))
		kldload contigmem.ko
	fi
}

function reset_freebsd() {
	kldunload contigmem.ko || true
	kldunload nic_uio.ko || true
}

CMD=reset cache_pci_bus

mode=$1

if [ -z "$mode" ]; then
	mode="config"
fi

: ${HUGEMEM:=2048}
: ${PCI_ALLOWED:=""}
: ${PCI_BLOCKED:=""}

if [ -n "$NVME_ALLOWED" ]; then
	PCI_ALLOWED="$PCI_ALLOWED $NVME_ALLOWED"
fi

if [ -n "$SKIP_PCI" ]; then
	PCI_ALLOWED="none"
fi

if [ -z "$TARGET_USER" ]; then
	TARGET_USER="$SUDO_USER"
	if [ -z "$TARGET_USER" ]; then
		TARGET_USER=$(logname 2> /dev/null) || true
	fi
fi

collect_devices "$mode"

if [[ $mode == reset && $PCI_BLOCK_SYNC_ON_RESET == yes ]]; then
	# Note that this will wait only for the first block device attached to
	# a given storage controller. For nvme this may miss some of the devs
	# in case multiple namespaces are in place.
	# FIXME: Wait for nvme controller(s) to be in live state and determine
	# number of configured namespaces, build list of potential block devs
	# and pass them to sync_dev_uevents. Is it worth the effort?
	bdfs_to_wait_for=()
	for bdf in "${!all_devices_d[@]}"; do
		((all_devices_d["$bdf"] == 0)) || continue
		if [[ -n ${nvme_d["$bdf"]} || -n ${virtio_d["$bdf"]} ]]; then
			[[ $(collect_driver "$bdf") != "${drivers_d["$bdf"]}" ]] || continue
			bdfs_to_wait_for+=("$bdf")
		fi
	done
	if ((${#bdfs_to_wait_for[@]} > 0)); then
		echo "Waiting for block devices as requested"
		export UEVENT_TIMEOUT=5 DEVPATH_LOOKUP=yes DEVPATH_SUBSYSTEM=pci
		"$rootdir/scripts/sync_dev_uevents.sh" \
			block/disk \
			"${bdfs_to_wait_for[@]}" &
		sync_pid=$!
	fi
fi

if [[ $os == Linux ]]; then
	if [[ -n $HUGEPGSZ && ! -e /sys/kernel/mm/hugepages/hugepages-${HUGEPGSZ}kB ]]; then
		echo "${HUGEPGSZ}kB is not supported by the running kernel, ignoring" >&2
		unset -v HUGEPGSZ
	fi

	HUGEPGSZ=${HUGEPGSZ:-$(grep Hugepagesize /proc/meminfo | cut -d : -f 2 | tr -dc '0-9')}
	HUGEPGSZ_MB=$((HUGEPGSZ / 1024))
	: ${NRHUGE=$(((HUGEMEM + HUGEPGSZ_MB - 1) / HUGEPGSZ_MB))}

	if [ "$mode" == "config" ]; then
		configure_linux
	elif [ "$mode" == "cleanup" ]; then
		cleanup_linux
		clear_hugepages
	elif [ "$mode" == "reset" ]; then
		reset_linux
	elif [ "$mode" == "status" ]; then
		status_linux
	elif [ "$mode" == "help" ]; then
		usage $0
	else
		usage $0 "Invalid argument '$mode'"
	fi
else
	if [ "$mode" == "config" ]; then
		configure_freebsd
	elif [ "$mode" == "reset" ]; then
		reset_freebsd
	elif [ "$mode" == "cleanup" ]; then
		echo "setup.sh cleanup function not yet supported on $os"
	elif [ "$mode" == "status" ]; then
		status_freebsd
	elif [ "$mode" == "help" ]; then
		usage $0
	else
		usage $0 "Invalid argument '$mode'"
	fi
fi

if [[ -e /proc/$sync_pid/status ]]; then
	wait "$sync_pid"
fi