#!/usr/bin/env bash

set -e
shopt -s nullglob extglob

os=$(uname -s)

if [[ $os != Linux && $os != FreeBSD ]]; then
	echo "Unsupported platform ($os), aborting"
	exit 1
fi

rootdir=$(readlink -f "$(dirname "$0")")/..
source "$rootdir/scripts/common.sh"

function usage() {
	if [[ $os == Linux ]]; then
		options="[config|reset|status|cleanup|help]"
	else
		options="[config|reset|help]"
	fi

	[[ -n $2 ]] && (
		echo "$2"
		echo ""
	)
	echo "Helper script for allocating hugepages and binding NVMe, I/OAT, VMD and Virtio devices"
	echo "to a generic VFIO kernel driver. If VFIO is not available on the system, this script"
	echo "will fall back to UIO. NVMe and Virtio devices with active mountpoints will be ignored."
	echo "All hugepage operations use the default hugepage size on the system (hugepagesz)."
	echo "Usage: $(basename $1) $options"
	echo
	echo "$options - as follows:"
	echo "config            Default mode. Allocate hugepages and bind PCI devices."
	if [[ $os == Linux ]]; then
		echo "cleanup           Remove any orphaned files that may be left in the system after SPDK application exit"
	fi
	echo "reset             Rebind PCI devices back to their original drivers."
	echo "                  Also cleanup any leftover spdk files/resources."
	echo "                  Hugepage memory size will remain unchanged."
	if [[ $os == Linux ]]; then
		echo "status            Print status of all SPDK-compatible devices on the system."
	fi
	echo "help              Print this help message."
	echo
	echo "The following environment variables can be specified."
	echo "HUGEMEM           Size of hugepage memory to allocate (in MB). 2048 by default."
	echo "                  For NUMA systems, the hugepages will be distributed on node0 by"
	echo "                  default."
	echo "HUGE_EVEN_ALLOC   If set to 'yes', hugepages will be evenly distributed across all"
	echo "                  system's NUMA nodes (effectively ignoring anything set in HUGENODE)."
	echo "                  Uses kernel's default for hugepages size."
	echo "NRHUGE            Number of hugepages to allocate. This variable overwrites HUGEMEM."
	echo "HUGENODE          Specific NUMA node to allocate hugepages on. Multiple nodes can be"
	echo "                  separated with commas. By default, NRHUGE will be applied on each node."
	echo "                  Hugepages can be defined per node with e.g.:"
	echo "                  HUGENODE='nodes_hp[0]=2048,nodes_hp[1]=512,2' - this will allocate"
	echo "                  2048 pages for node0, 512 for node1 and default NRHUGE for node2."
	echo "HUGEPGSZ          Size of the hugepages to use in kB. If not set, kernel's default"
	echo "                  setting is used."
	echo "SHRINK_HUGE       If set to 'yes', hugepages allocation won't be skipped in case"
	echo "                  the number of requested hugepages is lower than what's already"
	echo "                  allocated. Doesn't apply when HUGE_EVEN_ALLOC is in use."
	echo "CLEAR_HUGE        If set to 'yes', the attempt to remove hugepages from all nodes will"
	echo "                  be made prior to allocation."
	echo "PCI_ALLOWED"
	echo "PCI_BLOCKED       Whitespace separated list of PCI devices (NVMe, I/OAT, VMD, Virtio)."
	echo "                  Each device must be specified as a full PCI address."
	echo "                  E.g. PCI_ALLOWED=\"0000:01:00.0 0000:02:00.0\""
	echo "                  To block all PCI devices: PCI_ALLOWED=\"none\""
	echo "                  To allow all PCI devices except 0000:01:00.0: PCI_BLOCKED=\"0000:01:00.0\""
	echo "                  To allow only PCI device 0000:01:00.0: PCI_ALLOWED=\"0000:01:00.0\""
	echo "                  If PCI_ALLOWED and PCI_BLOCKED are empty or unset, all PCI devices"
	echo "                  will be bound."
	echo "                  Each device in PCI_BLOCKED will be ignored (driver won't be changed)."
	echo "                  PCI_BLOCKED has precedence over PCI_ALLOWED."
	echo "TARGET_USER       User that will own hugepage mountpoint directory and vfio groups."
	echo "                  By default the current user will be used."
	echo "DRIVER_OVERRIDE   Disable automatic vfio-pci/uio_pci_generic selection and forcefully"
	echo "                  bind devices to the given driver."
	echo "                  E.g. DRIVER_OVERRIDE=uio_pci_generic or DRIVER_OVERRIDE=/home/public/dpdk/build/kmod/igb_uio.ko"
	echo "PCI_BLOCK_SYNC_ON_RESET"
	echo "                  If set in the environment, the attempt to wait for block devices associated"
	echo "                  with a given PCI device will be made upon reset"
	exit 0
}
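
# Illustrative invocations (the script writes to sysfs, so it is typically run
# as root or via sudo); modes and variables are documented in usage() above:
#   HUGEMEM=4096 ./setup.sh                  # allocate 4GB worth of hugepages and bind devices
#   PCI_BLOCKED="0000:01:00.0" ./setup.sh    # bind everything except 0000:01:00.0
#   ./setup.sh status                        # Linux only
#   ./setup.sh reset                         # rebind devices to their original drivers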

# In monolithic kernels lsmod won't work, so fall back to checking /sys/module.
# We also check /sys/bus/pci/drivers/ as neither lsmod nor /sys/module may
# contain the needed info (e.g. on Fedora-like distros).
function check_for_driver() {
	if [[ -z $1 ]]; then
		return 0
	fi

	if lsmod | grep -q "${1//-/_}"; then
		return 1
	fi

	if [[ -d /sys/module/${1} || -d \
		/sys/module/${1//-/_} || -d \
		/sys/bus/pci/drivers/${1} || -d \
		/sys/bus/pci/drivers/${1//-/_} ]]; then
		return 2
	fi
	return 0
}

function check_for_driver_freebsd() {
	# Check if dpdk drivers (nic_uio, contigmem) are in the kernel's module path.
	local search_paths path driver
	IFS=";" read -ra search_paths < <(kldconfig -rU)

	for driver in contigmem.ko nic_uio.ko; do
		for path in "${search_paths[@]}"; do
			[[ -f $path/$driver ]] && continue 2
		done
		return 1
	done
	return 0
}

function pci_dev_echo() {
	local bdf="$1"
	shift
	echo "$bdf (${pci_ids_vendor["$bdf"]#0x} ${pci_ids_device["$bdf"]#0x}): $*"
}
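
# Rebind a device to the requested driver through sysfs, unbinding it from its
# current driver first, e.g.: linux_bind_driver 0000:01:00.0 vfio-pci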
function linux_bind_driver() {
	bdf="$1"
	driver_name="$2"
	old_driver_name=${drivers_d["$bdf"]:-no driver}
	ven_dev_id="${pci_ids_vendor["$bdf"]#0x} ${pci_ids_device["$bdf"]#0x}"

	if [[ $driver_name == "$old_driver_name" ]]; then
		pci_dev_echo "$bdf" "Already using the $old_driver_name driver"
		return 0
	fi

	if [[ $old_driver_name != "no driver" ]]; then
		echo "$ven_dev_id" > "/sys/bus/pci/devices/$bdf/driver/remove_id" 2> /dev/null || true
		echo "$bdf" > "/sys/bus/pci/devices/$bdf/driver/unbind"
	fi

	pci_dev_echo "$bdf" "$old_driver_name -> $driver_name"

	if [[ $driver_name == "none" ]]; then
		return 0
	fi

	echo "$ven_dev_id" > "/sys/bus/pci/drivers/$driver_name/new_id" 2> /dev/null || true
	echo "$bdf" > "/sys/bus/pci/drivers/$driver_name/bind" 2> /dev/null || true

	if [[ $driver_name == uio_pci_generic ]] && ! check_for_driver igb_uio; then
		# Check if the uio_pci_generic driver is broken as it might be in
		# some 4.18.x kernels (see centos8 for instance) - if our device
		# didn't get a proper uio entry, fall back to igb_uio
		if [[ ! -e /sys/bus/pci/devices/$bdf/uio ]]; then
			pci_dev_echo "$bdf" "uio_pci_generic potentially broken, moving to igb_uio"
			drivers_d["$bdf"]="no driver"
			# This call will override $driver_name for remaining devices as well
			linux_bind_driver "$bdf" igb_uio
		fi
	fi

	iommu_group=$(basename "$(readlink -f "/sys/bus/pci/devices/$bdf/iommu_group")")
	if [ -e "/dev/vfio/$iommu_group" ]; then
		if [ -n "$TARGET_USER" ]; then
			chown "$TARGET_USER" "/dev/vfio/$iommu_group"
		fi
	fi
}

function linux_unbind_driver() {
	local bdf="$1"
	local ven_dev_id
	ven_dev_id="${pci_ids_vendor["$bdf"]#0x} ${pci_ids_device["$bdf"]#0x}"
	local old_driver_name=${drivers_d["$bdf"]:-no driver}

	if [[ $old_driver_name == "no driver" ]]; then
		pci_dev_echo "$bdf" "Not bound to any driver"
		return 0
	fi

	if [[ -e /sys/bus/pci/drivers/$old_driver_name ]]; then
		echo "$ven_dev_id" > "/sys/bus/pci/drivers/$old_driver_name/remove_id" 2> /dev/null || true
		echo "$bdf" > "/sys/bus/pci/drivers/$old_driver_name/unbind"
	fi

	pci_dev_echo "$bdf" "$old_driver_name -> no driver"
}

function linux_hugetlbfs_mounts() {
	mount | grep ' type hugetlbfs ' | awk '{ print $3 }'
}

function get_block_dev_from_bdf() {
	local bdf=$1
	local block

	for block in /sys/block/*; do
		if [[ $(readlink -f "$block/device") == *"/$bdf/"* ]]; then
			echo "${block##*/}"
		fi
	done
}

function get_used_bdf_block_devs() {
	local bdf=$1
	local blocks block blockp dev mount holder
	local used

	hash lsblk &> /dev/null || return 1
	blocks=($(get_block_dev_from_bdf "$bdf"))

	for block in "${blocks[@]}"; do
		# Check if the device is held by some other device, regardless of
		# whether it's mounted or not.
		for holder in "/sys/class/block/$block"*/holders/*; do
			[[ -e $holder ]] || continue
			blockp=${holder%/holders*} blockp=${blockp##*/}
			if [[ -e $holder/slaves/$blockp ]]; then
				used+=("holder@$blockp:${holder##*/}")
			fi
		done
		while read -r dev mount; do
			if [[ -e $mount ]]; then
				used+=("mount@$block:$dev")
			fi
		done < <(lsblk -l -n -o NAME,MOUNTPOINT "/dev/$block")
		if ((${#used[@]} == 0)); then
			# Make sure we check if there's any valid data present on the target
			# device, regardless of whether it's being actively used or not. This
			# is mainly done to make sure we don't miss more complex setups like
			# ZFS pools, etc.
			if block_in_use "$block" > /dev/null; then
				used+=("data@$block")
			fi
		fi
	done

	if ((${#used[@]} > 0)); then
		printf '%s\n' "${used[@]}"
	fi
}
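
# Walk pci_ids.h for every supported device type and populate the global
# per-type maps (nvme_d, ioat_d, dsa_d, iaa_d, virtio_d, vmd_d) together with
# all_devices_d (bdf -> in_use flag) and drivers_d (bdf -> current driver).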
function collect_devices() {
	# NVMe, IOAT, DSA, IAA, VIRTIO, VMD

	local ids dev_type dev_id bdf bdfs in_use driver

	ids+="PCI_DEVICE_ID_INTEL_IOAT"
	ids+="|PCI_DEVICE_ID_INTEL_DSA"
	ids+="|PCI_DEVICE_ID_INTEL_IAA"
	ids+="|PCI_DEVICE_ID_VIRTIO"
	ids+="|PCI_DEVICE_ID_INTEL_VMD"
	ids+="|SPDK_PCI_CLASS_NVME"

	local -gA nvme_d ioat_d dsa_d iaa_d virtio_d vmd_d all_devices_d drivers_d

	while read -r _ dev_type dev_id; do
		bdfs=(${pci_bus_cache["0x8086:$dev_id"]})
		[[ $dev_type == *NVME* ]] && bdfs=(${pci_bus_cache["$dev_id"]})
		[[ $dev_type == *VIRT* ]] && bdfs=(${pci_bus_cache["0x1af4:$dev_id"]})
		[[ $dev_type =~ (NVME|IOAT|DSA|IAA|VIRTIO|VMD) ]] && dev_type=${BASH_REMATCH[1],,}
		for bdf in "${bdfs[@]}"; do
			in_use=0
			if [[ $1 != status ]]; then
				if ! pci_can_use "$bdf"; then
					pci_dev_echo "$bdf" "Skipping denied controller at $bdf"
					in_use=1
				fi
				if [[ $dev_type == nvme || $dev_type == virtio ]]; then
					if ! verify_bdf_block_devs "$bdf"; then
						in_use=1
					fi
				fi
				if [[ $dev_type == vmd ]]; then
					if [[ $PCI_ALLOWED != *"$bdf"* ]]; then
						pci_dev_echo "$bdf" "Skipping not allowed VMD controller at $bdf"
						in_use=1
					elif [[ " ${drivers_d[*]} " =~ "nvme" ]]; then
						if [[ "${DRIVER_OVERRIDE}" != "none" ]]; then
							if [ "$mode" == "config" ]; then
								cat <<- MESSAGE
									Binding new driver to VMD device. If there are NVMe SSDs behind the VMD endpoint
									which are attached to the kernel NVMe driver, the binding process may go faster
									if you first run this script with DRIVER_OVERRIDE="none" to unbind only the
									NVMe SSDs, and then run again to unbind the VMD devices.
								MESSAGE
							fi
						fi
					fi
				fi
			fi
			eval "${dev_type}_d[$bdf]=$in_use"
			all_devices_d["$bdf"]=$in_use
			if [[ -e /sys/bus/pci/devices/$bdf/driver ]]; then
				driver=$(readlink -f "/sys/bus/pci/devices/$bdf/driver")
				drivers_d["$bdf"]=${driver##*/}
			fi
		done
	done < <(grep -E "$ids" "$rootdir/include/spdk/pci_ids.h")
}

function collect_driver() {
	local bdf=$1
	local drivers driver

	if [[ -e /sys/bus/pci/devices/$bdf/modalias ]] \
		&& drivers=($(modprobe -R "$(< "/sys/bus/pci/devices/$bdf/modalias")")); then
		# Pick first entry in case multiple aliases are bound to a driver.
		driver=$(readlink -f "/sys/module/${drivers[0]}/drivers/pci:"*)
		driver=${driver##*/}
	else
		[[ -n ${nvme_d["$bdf"]} ]] && driver=nvme
		[[ -n ${ioat_d["$bdf"]} ]] && driver=ioatdma
		[[ -n ${dsa_d["$bdf"]} ]] && driver=dsa
		[[ -n ${iaa_d["$bdf"]} ]] && driver=iaa
		[[ -n ${virtio_d["$bdf"]} ]] && driver=virtio-pci
		[[ -n ${vmd_d["$bdf"]} ]] && driver=vmd
	fi 2> /dev/null
	echo "$driver"
}

function verify_bdf_block_devs() {
	local bdf=$1
	local blknames
	blknames=($(get_used_bdf_block_devs "$bdf")) || return 1

	if ((${#blknames[@]} > 0)); then
		local IFS=","
		pci_dev_echo "$bdf" "Active devices: ${blknames[*]}, so not binding PCI dev"
		return 1
	fi
}

function configure_linux_pci() {
	local driver_path=""
	driver_name=""
	igb_uio_fallback=""

	if [[ -r "$rootdir/dpdk/build-tmp/kernel/linux/igb_uio/igb_uio.ko" ]]; then
		# igb_uio is a common driver to override with and it depends on uio.
		modprobe uio || true
		if ! check_for_driver igb_uio || insmod "$rootdir/dpdk/build-tmp/kernel/linux/igb_uio/igb_uio.ko"; then
			igb_uio_fallback="$rootdir/dpdk/build-tmp/kernel/linux/igb_uio/igb_uio.ko"
		fi
	fi

	if [[ "${DRIVER_OVERRIDE}" == "none" ]]; then
		driver_name=none
	elif [[ -n "${DRIVER_OVERRIDE}" ]]; then
		driver_path="$DRIVER_OVERRIDE"
		driver_name="${DRIVER_OVERRIDE##*/}"
		# modprobe and the sysfs don't use the .ko suffix.
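		# E.g. DRIVER_OVERRIDE=/home/public/dpdk/build/kmod/igb_uio.ko yields
		# driver_path=/home/public/dpdk/build/kmod/igb_uio.ko and driver_name=igb_uio,
		# while DRIVER_OVERRIDE=uio_pci_generic yields an empty driver_path and a
		# plain modprobe below.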
		driver_name=${driver_name%.ko}
		# path = name -> there is no path
		if [[ "$driver_path" = "$driver_name" ]]; then
			driver_path=""
		fi
	elif [[ -n "$(ls /sys/kernel/iommu_groups)" || (-e \
		/sys/module/vfio/parameters/enable_unsafe_noiommu_mode && \
		"$(cat /sys/module/vfio/parameters/enable_unsafe_noiommu_mode)" == "Y") ]]; then
		driver_name=vfio-pci
		# Just in case, attempt to load VFIO_IOMMU_TYPE1 module into the kernel - this
		# should be done automatically by modprobe since this particular module should
		# be a part of vfio-pci dependencies, however, on some distros, it seems that
		# it's not the case. See #1689.
		if modinfo vfio_iommu_type1 > /dev/null; then
			modprobe vfio_iommu_type1
		fi
	elif ! check_for_driver uio_pci_generic || modinfo uio_pci_generic > /dev/null 2>&1; then
		driver_name=uio_pci_generic
	elif [[ -e $igb_uio_fallback ]]; then
		driver_path="$igb_uio_fallback"
		driver_name="igb_uio"
		echo "WARNING: uio_pci_generic not detected - using $driver_name"
	else
		echo "No valid drivers found [vfio-pci, uio_pci_generic, igb_uio]. Please enable one of the kernel modules."
		return 1
	fi

	# modprobe looks the module up by name in the kernel's module directory.
	# If the user passed in a path, we have to use insmod instead.
	if [[ $driver_name != "none" ]]; then
		if [[ -n "$driver_path" ]]; then
			insmod "$driver_path" || true
		else
			modprobe "$driver_name"
		fi
	fi

	for bdf in "${!all_devices_d[@]}"; do
		if ((all_devices_d["$bdf"] == 0)); then
			if [[ -n ${nvme_d["$bdf"]} ]]; then
				# Some nvme controllers may take a significant amount of time while
				# being unbound from the driver. Put that task into background to
				# speed up the whole process. Currently this is done only for the
				# devices bound to the nvme driver as others, i.e., ioatdma's,
				# trigger a kernel BUG when being unbound in parallel. See
				# https://bugzilla.kernel.org/show_bug.cgi?id=209041.
				linux_bind_driver "$bdf" "$driver_name" &
			else
				linux_bind_driver "$bdf" "$driver_name"
			fi
		fi
	done
	wait

	echo "1" > "/sys/bus/pci/rescan"
}
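
# Remove leftover SPDK runtime artifacts (dpdk pid dirs, shm trace files, pci
# lock files) that are not currently opened by any process.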
function cleanup_linux() {
	local dirs_to_clean=() files_to_clean=() opened_files=() file_locks=()
	local match_spdk="spdk_tgt|iscsi|vhost|nvmf|rocksdb|bdevio|bdevperf|vhost_fuzz|nvme_fuzz|accel_perf|bdev_svc"

	dirs_to_clean=({/var/run,/tmp}/dpdk/spdk{,_pid}+([0-9]))
	if [[ -d $XDG_RUNTIME_DIR ]]; then
		dirs_to_clean+=("$XDG_RUNTIME_DIR/dpdk/spdk"{,_pid}+([0-9]))
	fi

	for dir in "${dirs_to_clean[@]}"; do
		files_to_clean+=("$dir/"*)
	done
	file_locks+=(/var/tmp/spdk_pci_lock*)

	files_to_clean+=(/dev/shm/@(@($match_spdk)_trace|spdk_iscsi_conns))
	files_to_clean+=("${file_locks[@]}")
	if ((${#files_to_clean[@]} == 0)); then
		echo "Clean"
		return 0
	fi

	opened_files+=($(readlink -f /proc/+([0-9])/fd/+([0-9])))

	if ((${#opened_files[@]} == 0)); then
		echo "Can't get list of opened files!"
		exit 1
	fi

	echo 'Cleaning'
	for f in "${files_to_clean[@]}"; do
		[[ -e $f ]] || continue
		if [[ ${opened_files[*]} != *"$f"* ]]; then
			echo "Removing: $f"
			rm "$f"
		else
			echo "Still open: $f"
		fi
	done

	for dir in "${dirs_to_clean[@]}"; do
		[[ -d $dir ]] || continue
		if [[ ${opened_files[*]} != *"$dir"* ]]; then
			echo "Removing: $dir"
			rmdir "$dir"
		else
			echo "Still open: $dir"
		fi
	done
	echo "Clean"
}

check_hugepages_alloc() {
	local hp_int=$1
	local allocated_hugepages

	allocated_hugepages=$(< "$hp_int")

	if ((NRHUGE <= allocated_hugepages)) && [[ $SHRINK_HUGE != yes ]]; then
		echo "INFO: Requested $NRHUGE hugepages but $allocated_hugepages already allocated ${2:+on node$2}"
		return 0
	fi

	echo $((NRHUGE < 0 ? 0 : NRHUGE)) > "$hp_int"

	allocated_hugepages=$(< "$hp_int")
	if ((allocated_hugepages < NRHUGE)); then
		cat <<- ERROR

			## ERROR: requested $NRHUGE hugepages but $allocated_hugepages could be allocated ${2:+on node$2}.
			## Memory might be heavily fragmented. Please try flushing the system cache, or reboot the machine.
		ERROR
		return 1
	fi
}

clear_hugepages() { echo 0 > /proc/sys/vm/nr_hugepages; }

configure_linux_hugepages() {
	local node system_nodes
	local nodes_to_use nodes_hp

	if [[ $CLEAR_HUGE == yes ]]; then
		clear_hugepages
	fi

	if [[ $HUGE_EVEN_ALLOC == yes ]]; then
		clear_hugepages
		check_hugepages_alloc /proc/sys/vm/nr_hugepages
		return 0
	fi

	for node in /sys/devices/system/node/node*; do
		[[ -e $node ]] || continue
		nodes[${node##*node}]=$node/hugepages/hugepages-${HUGEPGSZ}kB/nr_hugepages
	done

	if ((${#nodes[@]} == 0)); then
		# No NUMA support? Fall back to the common interface.
		check_hugepages_alloc /proc/sys/vm/nr_hugepages
		return 0
	fi

	IFS="," read -ra nodes_to_use <<< "$HUGENODE"
	if ((${#nodes_to_use[@]} == 0)); then
		nodes_to_use[0]=0
	fi

	# Align indexes with node ids
	for node in "${!nodes_to_use[@]}"; do
		if [[ ${nodes_to_use[node]} =~ ^nodes_hp\[[0-9]+\]= ]]; then
			eval "${nodes_to_use[node]}"
		elif [[ ${nodes_to_use[node]} =~ ^[0-9]+$ ]]; then
			nodes_hp[nodes_to_use[node]]=$NRHUGE
		fi
	done

	for node in "${!nodes_hp[@]}"; do
		if [[ -z ${nodes[node]} ]]; then
			echo "Node $node doesn't exist, ignoring" >&2
			continue
		fi
		NRHUGE=${nodes_hp[node]:-$NRHUGE} check_hugepages_alloc "${nodes[node]}" "$node"
	done
}
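
# E.g. with HUGENODE='nodes_hp[0]=2048,nodes_hp[1]=512,2' the loop above
# allocates 2048 pages on node0, 512 on node1 and the default NRHUGE on node2
# (see the HUGENODE description in usage()).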

function configure_linux() {
	configure_linux_pci
	hugetlbfs_mounts=$(linux_hugetlbfs_mounts)

	if [ -z "$hugetlbfs_mounts" ]; then
		hugetlbfs_mounts=/mnt/huge
		echo "Mounting hugetlbfs at $hugetlbfs_mounts"
		mkdir -p "$hugetlbfs_mounts"
		mount -t hugetlbfs nodev "$hugetlbfs_mounts"
	fi

	configure_linux_hugepages

	if [ "$driver_name" = "vfio-pci" ]; then
		if [ -n "$TARGET_USER" ]; then
			for mount in $hugetlbfs_mounts; do
				chown "$TARGET_USER" "$mount"
				chmod g+w "$mount"
			done

			MEMLOCK_AMNT=$(su "$TARGET_USER" -c "ulimit -l")
			if [[ $MEMLOCK_AMNT != "unlimited" ]]; then
				MEMLOCK_MB=$((MEMLOCK_AMNT / 1024))
				cat <<- MEMLOCK
					"$TARGET_USER" user memlock limit: $MEMLOCK_MB MB

					This is the maximum amount of memory you will be
					able to use with DPDK and VFIO if run as user "$TARGET_USER".
					To change this, please adjust limits.conf memlock limit for user "$TARGET_USER".
				MEMLOCK
				if ((MEMLOCK_AMNT < 65536)); then
					echo ""
					echo "## WARNING: memlock limit is less than 64MB"
					echo -n "## DPDK with VFIO may not be able to initialize "
					echo "if run as user \"$TARGET_USER\"."
				fi
			fi
		fi
	fi

	if [ "$(uname -i)" == "x86_64" ] && [ ! -e /dev/cpu/0/msr ]; then
		# Some distros build msr as a module. Make sure it's loaded to ensure
		# DPDK can easily figure out the TSC rate rather than relying on 100ms
		# sleeps.
		modprobe msr &> /dev/null || true
	fi
}

function reset_linux_pci() {
	# virtio
	# TODO: check if virtio-pci is loaded first and just unbind if it is not loaded
	# Requires some more investigation - for example, some kernels do not seem to have
	# virtio-pci but just virtio_scsi instead. Also need to make sure we get the
	# underscore vs. dash right in the virtio_scsi name.
	modprobe virtio-pci || true
	for bdf in "${!all_devices_d[@]}"; do
		((all_devices_d["$bdf"] == 0)) || continue

		driver=$(collect_driver "$bdf")
		if [[ -n $driver ]] && ! check_for_driver "$driver"; then
			linux_bind_driver "$bdf" "$driver"
		else
			linux_unbind_driver "$bdf"
		fi
	done

	echo "1" > "/sys/bus/pci/rescan"
}

function reset_linux() {
	reset_linux_pci
	for mount in $(linux_hugetlbfs_mounts); do
		for hp in "$mount"/spdk*map_*; do
			flock -n "$hp" true && rm -f "$hp"
		done
	done
	rm -f /run/.spdk*
}
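
# Report hugepage availability (per NUMA node when the sysfs interface is
# present, otherwise system-wide) followed by a table of all SPDK-compatible
# devices found on the system.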
function status_linux() {
	echo "Hugepages" >&2
	printf "%-6s %10s %8s / %6s\n" "node" "hugesize" "free" "total" >&2

	numa_nodes=0
	for path in /sys/devices/system/node/node*/hugepages/hugepages-*/; do
		numa_nodes=$((numa_nodes + 1))
		free_pages=$(cat "$path/free_hugepages")
		all_pages=$(cat "$path/nr_hugepages")

		[[ $path =~ (node[0-9]+)/hugepages/hugepages-([0-9]+kB) ]]

		node=${BASH_REMATCH[1]}
		huge_size=${BASH_REMATCH[2]}

		printf "%-6s %10s %8s / %6s\n" "$node" "$huge_size" "$free_pages" "$all_pages"
	done

	# fall back to system-wide hugepages
	if [ "$numa_nodes" = "0" ]; then
		free_pages=$(grep HugePages_Free /proc/meminfo | awk '{ print $2 }')
		all_pages=$(grep HugePages_Total /proc/meminfo | awk '{ print $2 }')
		node="-"
		huge_size="$HUGEPGSZ"

		printf "%-6s %10s %8s / %6s\n" "$node" "$huge_size" "$free_pages" "$all_pages"
	fi

	printf '\n%-8s %-15s %-6s %-6s %-7s %-16s %-10s %s\n' \
		"Type" "BDF" "Vendor" "Device" "NUMA" "Driver" "Device" "Block devices" >&2

	sorted_bdfs=($(printf '%s\n' "${!all_devices_d[@]}" | sort))

	for bdf in "${sorted_bdfs[@]}"; do
		driver=${drivers_d["$bdf"]}
		if [ "$numa_nodes" = "0" ]; then
			node="-"
		else
			node=$(cat /sys/bus/pci/devices/$bdf/numa_node)
			if ((node == -1)); then
				node=unknown
			fi
		fi
		if [ "$driver" = "nvme" ] && [ -d /sys/bus/pci/devices/$bdf/nvme ]; then
			name=$(ls /sys/bus/pci/devices/$bdf/nvme)
		else
			name="-"
		fi

		if [[ -n ${nvme_d["$bdf"]} || -n ${virtio_d["$bdf"]} ]]; then
			blknames=($(get_block_dev_from_bdf "$bdf"))
		else
			blknames=("-")
		fi

		desc=""
		desc=${desc:-${nvme_d["$bdf"]:+NVMe}}
		desc=${desc:-${ioat_d["$bdf"]:+I/OAT}}
		desc=${desc:-${dsa_d["$bdf"]:+DSA}}
		desc=${desc:-${iaa_d["$bdf"]:+IAA}}
		desc=${desc:-${virtio_d["$bdf"]:+virtio}}
		desc=${desc:-${vmd_d["$bdf"]:+VMD}}

		printf '%-8s %-15s %-6s %-6s %-7s %-16s %-10s %s\n' \
			"$desc" "$bdf" "${pci_ids_vendor["$bdf"]#0x}" "${pci_ids_device["$bdf"]#0x}" \
			"$node" "${driver:--}" "${name:-}" "${blknames[*]:--}"
	done
}

function status_freebsd() {
	local pci

	status_print() (
		local type=$1
		local dev driver

		shift

		for pci; do
			printf '%-8s %-15s %-6s %-6s %-16s\n' \
				"$type" \
				"$pci" \
				"${pci_ids_vendor["$pci"]}" \
				"${pci_ids_device["$pci"]}" \
				"${pci_bus_driver["$pci"]}"
		done | sort -k2,2
	)

	local contigmem=present
	local contigmem_buffer_size
	local contigmem_num_buffers

	if ! kldstat -q -m contigmem; then
		contigmem="not present"
	fi
	if ! contigmem_buffer_size=$(kenv hw.contigmem.buffer_size 2> /dev/null); then
		contigmem_buffer_size="not set"
	fi
	if ! contigmem_num_buffers=$(kenv hw.contigmem.num_buffers 2> /dev/null); then
		contigmem_num_buffers="not set"
	fi

	cat <<- BSD_INFO
		Contigmem ($contigmem)
		Buffer Size: $contigmem_buffer_size
		Num Buffers: $contigmem_num_buffers

	BSD_INFO

	printf '\n%-8s %-15s %-6s %-6s %-16s\n' \
		"Type" "BDF" "Vendor" "Device" "Driver" >&2

	status_print "NVMe" "${!nvme_d[@]}"
	status_print "I/OAT" "${!ioat_d[@]}"
	status_print "DSA" "${!dsa_d[@]}"
	status_print "IAA" "${!iaa_d[@]}"
	status_print "VMD" "${!vmd_d[@]}"
}

function configure_freebsd_pci() {
	local BDFS

	BDFS+=("${!nvme_d[@]}")
	BDFS+=("${!ioat_d[@]}")
	BDFS+=("${!dsa_d[@]}")
	BDFS+=("${!iaa_d[@]}")
	BDFS+=("${!vmd_d[@]}")

	# Drop the domain part from all the addresses
	BDFS=("${BDFS[@]#*:}")

	local IFS=","
	kldunload nic_uio.ko || true
	kenv hw.nic_uio.bdfs="${BDFS[*]}"
	kldload nic_uio.ko
}
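
# Attach nic_uio to the selected devices and (re)load contigmem with one 256MB
# buffer per 256MB of requested HUGEMEM.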
function configure_freebsd() {
	if ! check_for_driver_freebsd; then
		echo "DPDK drivers (contigmem and/or nic_uio) are missing, aborting" >&2
		return 1
	fi
	configure_freebsd_pci
	# If contigmem is already loaded but the HUGEMEM specified doesn't match the
	# previous value, unload contigmem so that we can reload with the new value.
	if kldstat -q -m contigmem; then
		# contigmem may be loaded, but the kernel environment doesn't have to
		# be necessarily set at this point. If it isn't, kenv will fail to
		# pick up the hw. options. Handle it.
		if ! contigmem_num_buffers=$(kenv hw.contigmem.num_buffers); then
			contigmem_num_buffers=-1
		fi 2> /dev/null
		if ((contigmem_num_buffers != HUGEMEM / 256)); then
			kldunload contigmem.ko
		fi
	fi
	if ! kldstat -q -m contigmem; then
		kenv hw.contigmem.num_buffers=$((HUGEMEM / 256))
		kenv hw.contigmem.buffer_size=$((256 * 1024 * 1024))
		kldload contigmem.ko
	fi
}

function reset_freebsd() {
	kldunload contigmem.ko || true
	kldunload nic_uio.ko || true
}

CMD=reset cache_pci_bus

mode=$1

if [ -z "$mode" ]; then
	mode="config"
fi

: ${HUGEMEM:=2048}
: ${PCI_ALLOWED:=""}
: ${PCI_BLOCKED:=""}

if [ -n "$NVME_ALLOWED" ]; then
	PCI_ALLOWED="$PCI_ALLOWED $NVME_ALLOWED"
fi

if [ -n "$SKIP_PCI" ]; then
	PCI_ALLOWED="none"
fi

if [ -z "$TARGET_USER" ]; then
	TARGET_USER="$SUDO_USER"
	if [ -z "$TARGET_USER" ]; then
		TARGET_USER=$(logname 2> /dev/null) || true
	fi
fi

collect_devices "$mode"

if [[ $mode == reset && $PCI_BLOCK_SYNC_ON_RESET == yes ]]; then
	# Note that this will wait only for the first block device attached to
	# a given storage controller. For nvme this may miss some of the devs
	# in case multiple namespaces are in place.
	# FIXME: Wait for nvme controller(s) to be in live state and determine
	# number of configured namespaces, build list of potential block devs
	# and pass them to sync_dev_uevents. Is it worth the effort?
	bdfs_to_wait_for=()
	for bdf in "${!all_devices_d[@]}"; do
		((all_devices_d["$bdf"] == 0)) || continue
		if [[ -n ${nvme_d["$bdf"]} || -n ${virtio_d["$bdf"]} ]]; then
			[[ $(collect_driver "$bdf") != "${drivers_d["$bdf"]}" ]] || continue
			bdfs_to_wait_for+=("$bdf")
		fi
	done
	if ((${#bdfs_to_wait_for[@]} > 0)); then
		echo "Waiting for block devices as requested"
		export UEVENT_TIMEOUT=5 DEVPATH_LOOKUP=yes DEVPATH_SUBSYSTEM=pci
		"$rootdir/scripts/sync_dev_uevents.sh" \
			block/disk \
			"${bdfs_to_wait_for[@]}" &
		sync_pid=$!
	fi
fi

if [[ $os == Linux ]]; then
	if [[ -n $HUGEPGSZ && ! -e /sys/kernel/mm/hugepages/hugepages-${HUGEPGSZ}kB ]]; then
		echo "${HUGEPGSZ}kB is not supported by the running kernel, ignoring" >&2
		unset -v HUGEPGSZ
	fi

	HUGEPGSZ=${HUGEPGSZ:-$(grep Hugepagesize /proc/meminfo | cut -d : -f 2 | tr -dc '0-9')}
	HUGEPGSZ_MB=$((HUGEPGSZ / 1024))
	: ${NRHUGE=$(((HUGEMEM + HUGEPGSZ_MB - 1) / HUGEPGSZ_MB))}

	if [ "$mode" == "config" ]; then
		configure_linux
	elif [ "$mode" == "cleanup" ]; then
		cleanup_linux
		clear_hugepages
	elif [ "$mode" == "reset" ]; then
		reset_linux
	elif [ "$mode" == "status" ]; then
		status_linux
	elif [ "$mode" == "help" ]; then
		usage $0
	else
		usage $0 "Invalid argument '$mode'"
	fi
else
	if [ "$mode" == "config" ]; then
		configure_freebsd
	elif [ "$mode" == "reset" ]; then
		reset_freebsd
	elif [ "$mode" == "cleanup" ]; then
		echo "setup.sh cleanup function not yet supported on $os"
	elif [ "$mode" == "status" ]; then
		status_freebsd
	elif [ "$mode" == "help" ]; then
		usage $0
	else
		usage $0 "Invalid argument '$mode'"
	fi
fi

if [[ -e /proc/$sync_pid/status ]]; then
	wait "$sync_pid"
fi