1#!/usr/bin/env bash 2 3set -e 4shopt -s nullglob extglob 5 6os=$(uname -s) 7 8if [[ $os != Linux && $os != FreeBSD ]]; then 9 echo "Not supported platform ($os), aborting" 10 exit 1 11fi 12 13rootdir=$(readlink -f $(dirname $0))/.. 14source "$rootdir/scripts/common.sh" 15 16function usage() { 17 if [[ $os == Linux ]]; then 18 options="[config|reset|status|cleanup|help]" 19 else 20 options="[config|reset|help]" 21 fi 22 23 [[ -n $2 ]] && ( 24 echo "$2" 25 echo "" 26 ) 27 echo "Helper script for allocating hugepages and binding NVMe, I/OAT, VMD and Virtio devices" 28 echo "to a generic VFIO kernel driver. If VFIO is not available on the system, this script" 29 echo "will fall back to UIO. NVMe and Virtio devices with active mountpoints will be ignored." 30 echo "All hugepage operations use default hugepage size on the system (hugepagesz)." 31 echo "Usage: $(basename $1) $options" 32 echo 33 echo "$options - as following:" 34 echo "config Default mode. Allocate hugepages and bind PCI devices." 35 if [[ $os == Linux ]]; then 36 echo "cleanup Remove any orphaned files that can be left in the system after SPDK application exit" 37 fi 38 echo "reset Rebind PCI devices back to their original drivers." 39 echo " Also cleanup any leftover spdk files/resources." 40 echo " Hugepage memory size will remain unchanged." 41 if [[ $os == Linux ]]; then 42 echo "status Print status of all SPDK-compatible devices on the system." 43 fi 44 echo "help Print this help message." 45 echo 46 echo "The following environment variables can be specified." 47 echo "HUGEMEM Size of hugepage memory to allocate (in MB). 2048 by default." 48 echo " For NUMA systems, the hugepages will be distributed on node0 by" 49 echo " default." 50 echo "HUGE_EVEN_ALLOC If set to 'yes', hugepages will be evenly distributed across all" 51 echo " system's NUMA nodes (effectively ignoring anything set in HUGENODE)." 52 echo " Uses kernel's default for hugepages size." 53 echo "NRHUGE Number of hugepages to allocate. This variable overwrites HUGEMEM." 54 echo "HUGENODE Specific NUMA node to allocate hugepages on. Multiple nodes can be" 55 echo " separated with comas. By default, NRHUGE will be applied on each node." 56 echo " Hugepages can be defined per node with e.g.:" 57 echo " HUGENODE='nodes_hp[0]=2048,nodes_hp[1]=512,2' - this will allocate" 58 echo " 2048 pages for node0, 512 for node1 and default NRHUGE for node2." 59 echo "HUGEPGSZ Size of the hugepages to use in kB. If not set, kernel's default" 60 echo " setting is used." 61 echo "SHRINK_HUGE If set to 'yes', hugepages allocation won't be skipped in case" 62 echo " number of requested hugepages is lower from what's already" 63 echo " allocated. Doesn't apply when HUGE_EVEN_ALLOC is in use." 64 echo "CLEAR_HUGE If set to 'yes', the attempt to remove hugepages from all nodes will" 65 echo " be made prior to allocation". 66 echo "PCI_ALLOWED" 67 echo "PCI_BLOCKED Whitespace separated list of PCI devices (NVMe, I/OAT, VMD, Virtio)." 68 echo " Each device must be specified as a full PCI address." 69 echo " E.g. PCI_ALLOWED=\"0000:01:00.0 0000:02:00.0\"" 70 echo " To block all PCI devices: PCI_ALLOWED=\"none\"" 71 echo " To allow all PCI devices except 0000:01:00.0: PCI_BLOCKED=\"0000:01:00.0\"" 72 echo " To allow only PCI device 0000:01:00.0: PCI_ALLOWED=\"0000:01:00.0\"" 73 echo " If PCI_ALLOWED and PCI_BLOCKED are empty or unset, all PCI devices" 74 echo " will be bound." 75 echo " Each device in PCI_BLOCKED will be ignored (driver won't be changed)." 76 echo " PCI_BLOCKED has precedence over PCI_ALLOWED." 77 echo "TARGET_USER User that will own hugepage mountpoint directory and vfio groups." 78 echo " By default the current user will be used." 79 echo "DRIVER_OVERRIDE Disable automatic vfio-pci/uio_pci_generic selection and forcefully" 80 echo " bind devices to the given driver." 81 echo " E.g. DRIVER_OVERRIDE=uio_pci_generic or DRIVER_OVERRIDE=/home/public/dpdk/build/kmod/igb_uio.ko" 82 echo "PCI_BLOCK_SYNC_ON_RESET" 83 echo " If set in the environment, the attempt to wait for block devices associated" 84 echo " with given PCI device will be made upon reset" 85 exit 0 86} 87 88# In monolithic kernels the lsmod won't work. So 89# back that with a /sys/modules. We also check 90# /sys/bus/pci/drivers/ as neither lsmod nor /sys/modules might 91# contain needed info (like in Fedora-like OS). 92function check_for_driver() { 93 if [[ -z $1 ]]; then 94 return 0 95 fi 96 97 if lsmod | grep -q ${1//-/_}; then 98 return 1 99 fi 100 101 if [[ -d /sys/module/${1} || -d \ 102 /sys/module/${1//-/_} || -d \ 103 /sys/bus/pci/drivers/${1} || -d \ 104 /sys/bus/pci/drivers/${1//-/_} ]]; then 105 return 2 106 fi 107 return 0 108} 109 110function check_for_driver_freebsd() { 111 # Check if dpdk drivers (nic_uio, contigmem) are in the kernel's module path. 112 local search_paths path driver 113 IFS=";" read -ra search_paths < <(kldconfig -rU) 114 115 for driver in contigmem.ko nic_uio.ko; do 116 for path in "${search_paths[@]}"; do 117 [[ -f $path/$driver ]] && continue 2 118 done 119 return 1 120 done 121 return 0 122} 123 124function pci_dev_echo() { 125 local bdf="$1" 126 shift 127 echo "$bdf (${pci_ids_vendor["$bdf"]#0x} ${pci_ids_device["$bdf"]#0x}): $*" 128} 129 130function linux_bind_driver() { 131 bdf="$1" 132 driver_name="$2" 133 old_driver_name=${drivers_d["$bdf"]:-no driver} 134 ven_dev_id="${pci_ids_vendor["$bdf"]#0x} ${pci_ids_device["$bdf"]#0x}" 135 136 if [[ $driver_name == "$old_driver_name" ]]; then 137 pci_dev_echo "$bdf" "Already using the $old_driver_name driver" 138 return 0 139 fi 140 141 if [[ $old_driver_name != "no driver" ]]; then 142 echo "$ven_dev_id" > "/sys/bus/pci/devices/$bdf/driver/remove_id" 2> /dev/null || true 143 echo "$bdf" > "/sys/bus/pci/devices/$bdf/driver/unbind" 144 fi 145 146 pci_dev_echo "$bdf" "$old_driver_name -> $driver_name" 147 148 if [[ $driver_name == "none" ]]; then 149 return 0 150 fi 151 152 echo "$ven_dev_id" > "/sys/bus/pci/drivers/$driver_name/new_id" 2> /dev/null || true 153 echo "$bdf" > "/sys/bus/pci/drivers/$driver_name/bind" 2> /dev/null || true 154 155 if [[ $driver_name == uio_pci_generic ]] && ! check_for_driver igb_uio; then 156 # Check if the uio_pci_generic driver is broken as it might be in 157 # some 4.18.x kernels (see centos8 for instance) - if our device 158 # didn't get a proper uio entry, fallback to igb_uio 159 if [[ ! -e /sys/bus/pci/devices/$bdf/uio ]]; then 160 pci_dev_echo "$bdf" "uio_pci_generic potentially broken, moving to igb_uio" 161 drivers_d["$bdf"]="no driver" 162 # This call will override $driver_name for remaining devices as well 163 linux_bind_driver "$bdf" igb_uio 164 fi 165 fi 166 167 iommu_group=$(basename $(readlink -f /sys/bus/pci/devices/$bdf/iommu_group)) 168 if [ -e "/dev/vfio/$iommu_group" ]; then 169 if [ -n "$TARGET_USER" ]; then 170 chown "$TARGET_USER" "/dev/vfio/$iommu_group" 171 fi 172 fi 173} 174 175function linux_unbind_driver() { 176 local bdf="$1" 177 local ven_dev_id 178 ven_dev_id="${pci_ids_vendor["$bdf"]#0x} ${pci_ids_device["$bdf"]#0x}" 179 local old_driver_name=${drivers_d["$bdf"]:-no driver} 180 181 if [[ $old_driver_name == "no driver" ]]; then 182 pci_dev_echo "$bdf" "Not bound to any driver" 183 return 0 184 fi 185 186 if [[ -e /sys/bus/pci/drivers/$old_driver_name ]]; then 187 echo "$ven_dev_id" > "/sys/bus/pci/drivers/$old_driver_name/remove_id" 2> /dev/null || true 188 echo "$bdf" > "/sys/bus/pci/drivers/$old_driver_name/unbind" 189 fi 190 191 pci_dev_echo "$bdf" "$old_driver_name -> no driver" 192} 193 194function linux_hugetlbfs_mounts() { 195 mount | grep ' type hugetlbfs ' | awk '{ print $3 }' 196} 197 198function get_block_dev_from_bdf() { 199 local bdf=$1 200 local block 201 202 for block in /sys/block/*; do 203 if [[ $(readlink -f "$block/device") == *"/$bdf/"* ]]; then 204 echo "${block##*/}" 205 fi 206 done 207} 208 209function get_used_bdf_block_devs() { 210 local bdf=$1 211 local blocks block blockp dev mount holder 212 local used 213 214 hash lsblk &> /dev/null || return 1 215 blocks=($(get_block_dev_from_bdf "$bdf")) 216 217 for block in "${blocks[@]}"; do 218 # Check if the device is hold by some other, regardless if it's mounted 219 # or not. 220 for holder in "/sys/class/block/$block"*/holders/*; do 221 [[ -e $holder ]] || continue 222 blockp=${holder%/holders*} blockp=${blockp##*/} 223 if [[ -e $holder/slaves/$blockp ]]; then 224 used+=("holder@$blockp:${holder##*/}") 225 fi 226 done 227 while read -r dev mount; do 228 if [[ -e $mount ]]; then 229 used+=("mount@$block:$dev") 230 fi 231 done < <(lsblk -l -n -o NAME,MOUNTPOINT "/dev/$block") 232 if ((${#used[@]} == 0)); then 233 # Make sure we check if there's any valid data present on the target device 234 # regardless if it's being actively used or not. This is mainly done to make 235 # sure we don't miss more complex setups like ZFS pools, etc. 236 if block_in_use "$block" > /dev/null; then 237 used+=("data@$block") 238 fi 239 fi 240 done 241 242 if ((${#used[@]} > 0)); then 243 printf '%s\n' "${used[@]}" 244 fi 245} 246 247function collect_devices() { 248 # NVMe, IOAT, DSA, IAA, VIRTIO, VMD 249 250 local ids dev_type dev_id bdf bdfs in_use driver 251 252 ids+="PCI_DEVICE_ID_INTEL_IOAT" 253 ids+="|PCI_DEVICE_ID_INTEL_DSA" 254 ids+="|PCI_DEVICE_ID_INTEL_IAA" 255 ids+="|PCI_DEVICE_ID_VIRTIO" 256 ids+="|PCI_DEVICE_ID_INTEL_VMD" 257 ids+="|SPDK_PCI_CLASS_NVME" 258 259 local -gA nvme_d ioat_d dsa_d iaa_d virtio_d vmd_d all_devices_d drivers_d 260 261 while read -r _ dev_type dev_id; do 262 bdfs=(${pci_bus_cache["0x8086:$dev_id"]}) 263 [[ $dev_type == *NVME* ]] && bdfs=(${pci_bus_cache["$dev_id"]}) 264 [[ $dev_type == *VIRT* ]] && bdfs=(${pci_bus_cache["0x1af4:$dev_id"]}) 265 [[ $dev_type =~ (NVME|IOAT|DSA|IAA|VIRTIO|VMD) ]] && dev_type=${BASH_REMATCH[1],,} 266 for bdf in "${bdfs[@]}"; do 267 in_use=0 268 if [[ $1 != status ]]; then 269 if ! pci_can_use "$bdf"; then 270 pci_dev_echo "$bdf" "Skipping denied controller at $bdf" 271 in_use=1 272 fi 273 if [[ $dev_type == nvme || $dev_type == virtio ]]; then 274 if ! verify_bdf_block_devs "$bdf"; then 275 in_use=1 276 fi 277 fi 278 if [[ $dev_type == vmd ]]; then 279 if [[ $PCI_ALLOWED != *"$bdf"* ]]; then 280 pci_dev_echo "$bdf" "Skipping not allowed VMD controller at $bdf" 281 in_use=1 282 elif [[ " ${drivers_d[*]} " =~ "nvme" ]]; then 283 if [[ "${DRIVER_OVERRIDE}" != "none" ]]; then 284 if [ "$mode" == "config" ]; then 285 cat <<- MESSAGE 286 Binding new driver to VMD device. If there are NVMe SSDs behind the VMD endpoint 287 which are attached to the kernel NVMe driver,the binding process may go faster 288 if you first run this script with DRIVER_OVERRIDE="none" to unbind only the 289 NVMe SSDs, and then run again to unbind the VMD devices." 290 MESSAGE 291 fi 292 fi 293 fi 294 fi 295 fi 296 eval "${dev_type}_d[$bdf]=$in_use" 297 all_devices_d["$bdf"]=$in_use 298 if [[ -e /sys/bus/pci/devices/$bdf/driver ]]; then 299 driver=$(readlink -f "/sys/bus/pci/devices/$bdf/driver") 300 drivers_d["$bdf"]=${driver##*/} 301 fi 302 done 303 done < <(grep -E "$ids" "$rootdir/include/spdk/pci_ids.h") 304} 305 306function collect_driver() { 307 local bdf=$1 308 local drivers driver 309 310 if [[ -e /sys/bus/pci/devices/$bdf/modalias ]] \ 311 && drivers=($(modprobe -R "$(< "/sys/bus/pci/devices/$bdf/modalias")")); then 312 # Pick first entry in case multiple aliases are bound to a driver. 313 driver=$(readlink -f "/sys/module/${drivers[0]}/drivers/pci:"*) 314 driver=${driver##*/} 315 else 316 [[ -n ${nvme_d["$bdf"]} ]] && driver=nvme 317 [[ -n ${ioat_d["$bdf"]} ]] && driver=ioatdma 318 [[ -n ${dsa_d["$bdf"]} ]] && driver=dsa 319 [[ -n ${iaa_d["$bdf"]} ]] && driver=iaa 320 [[ -n ${virtio_d["$bdf"]} ]] && driver=virtio-pci 321 [[ -n ${vmd_d["$bdf"]} ]] && driver=vmd 322 fi 2> /dev/null 323 echo "$driver" 324} 325 326function verify_bdf_block_devs() { 327 local bdf=$1 328 local blknames 329 blknames=($(get_used_bdf_block_devs "$bdf")) || return 1 330 331 if ((${#blknames[@]} > 0)); then 332 local IFS="," 333 pci_dev_echo "$bdf" "Active devices: ${blknames[*]}, so not binding PCI dev" 334 return 1 335 fi 336} 337 338function configure_linux_pci() { 339 local driver_path="" 340 driver_name="" 341 igb_uio_fallback="" 342 343 if [[ -r "$rootdir/dpdk/build-tmp/kernel/linux/igb_uio/igb_uio.ko" ]]; then 344 # igb_uio is a common driver to override with and it depends on uio. 345 modprobe uio || true 346 if ! check_for_driver igb_uio || insmod "$rootdir/dpdk/build-tmp/kernel/linux/igb_uio/igb_uio.ko"; then 347 igb_uio_fallback="$rootdir/dpdk/build-tmp/kernel/linux/igb_uio/igb_uio.ko" 348 fi 349 fi 350 351 if [[ "${DRIVER_OVERRIDE}" == "none" ]]; then 352 driver_name=none 353 elif [[ -n "${DRIVER_OVERRIDE}" ]]; then 354 driver_path="$DRIVER_OVERRIDE" 355 driver_name="${DRIVER_OVERRIDE##*/}" 356 # modprobe and the sysfs don't use the .ko suffix. 357 driver_name=${driver_name%.ko} 358 # path = name -> there is no path 359 if [[ "$driver_path" = "$driver_name" ]]; then 360 driver_path="" 361 fi 362 elif [[ -n "$(ls /sys/kernel/iommu_groups)" || (-e \ 363 /sys/module/vfio/parameters/enable_unsafe_noiommu_mode && \ 364 "$(cat /sys/module/vfio/parameters/enable_unsafe_noiommu_mode)" == "Y") ]]; then 365 driver_name=vfio-pci 366 # Just in case, attempt to load VFIO_IOMMU_TYPE1 module into the kernel - this 367 # should be done automatically by modprobe since this particular module should 368 # be a part of vfio-pci dependencies, however, on some distros, it seems that 369 # it's not the case. See #1689. 370 if modinfo vfio_iommu_type1 > /dev/null; then 371 modprobe vfio_iommu_type1 372 fi 373 elif ! check_for_driver uio_pci_generic || modinfo uio_pci_generic > /dev/null 2>&1; then 374 driver_name=uio_pci_generic 375 elif [[ -e $igb_uio_fallback ]]; then 376 driver_path="$igb_uio_fallback" 377 driver_name="igb_uio" 378 echo "WARNING: uio_pci_generic not detected - using $driver_name" 379 else 380 echo "No valid drivers found [vfio-pci, uio_pci_generic, igb_uio]. Please enable one of the kernel modules." 381 return 1 382 fi 383 384 # modprobe assumes the directory of the module. If the user passes in a path, we should use insmod 385 if [[ $driver_name != "none" ]]; then 386 if [[ -n "$driver_path" ]]; then 387 insmod $driver_path || true 388 else 389 modprobe $driver_name 390 fi 391 fi 392 393 for bdf in "${!all_devices_d[@]}"; do 394 if ((all_devices_d["$bdf"] == 0)); then 395 if [[ -n ${nvme_d["$bdf"]} ]]; then 396 # Some nvme controllers may take significant amount of time while being 397 # unbound from the driver. Put that task into background to speed up the 398 # whole process. Currently this is done only for the devices bound to the 399 # nvme driver as other, i.e., ioatdma's, trigger a kernel BUG when being 400 # unbound in parallel. See https://bugzilla.kernel.org/show_bug.cgi?id=209041. 401 linux_bind_driver "$bdf" "$driver_name" & 402 else 403 linux_bind_driver "$bdf" "$driver_name" 404 fi 405 fi 406 done 407 wait 408 409 echo "1" > "/sys/bus/pci/rescan" 410} 411 412function cleanup_linux() { 413 local dirs_to_clean=() files_to_clean=() opened_files=() file_locks=() 414 local match_spdk="spdk_tgt|iscsi|vhost|nvmf|rocksdb|bdevio|bdevperf|vhost_fuzz|nvme_fuzz|accel_perf|bdev_svc" 415 416 dirs_to_clean=({/var/run,/tmp}/dpdk/spdk{,_pid}+([0-9])) 417 if [[ -d $XDG_RUNTIME_DIR ]]; then 418 dirs_to_clean+=("$XDG_RUNTIME_DIR/dpdk/spdk"{,_pid}+([0-9])) 419 fi 420 421 for dir in "${dirs_to_clean[@]}"; do 422 files_to_clean+=("$dir/"*) 423 done 424 file_locks+=(/var/tmp/spdk_pci_lock*) 425 426 files_to_clean+=(/dev/shm/@(@($match_spdk)_trace|spdk_iscsi_conns)) 427 files_to_clean+=("${file_locks[@]}") 428 429 # This may fail in case path that readlink attempts to resolve suddenly 430 # disappears (as it may happen with terminating processes). 431 opened_files+=($(readlink -f /proc/+([0-9])/fd/+([0-9]))) || true 432 433 if ((${#opened_files[@]} == 0)); then 434 echo "Can't get list of opened files!" 435 exit 1 436 fi 437 438 echo 'Cleaning' 439 for f in "${files_to_clean[@]}"; do 440 [[ -e $f ]] || continue 441 if [[ ${opened_files[*]} != *"$f"* ]]; then 442 echo "Removing: $f" 443 rm $f 444 else 445 echo "Still open: $f" 446 fi 447 done 448 449 for dir in "${dirs_to_clean[@]}"; do 450 [[ -d $dir ]] || continue 451 if [[ ${opened_files[*]} != *"$dir"* ]]; then 452 echo "Removing: $dir" 453 rmdir $dir 454 else 455 echo "Still open: $dir" 456 fi 457 done 458 echo "Clean" 459} 460 461check_hugepages_alloc() { 462 local hp_int=$1 463 local allocated_hugepages 464 465 allocated_hugepages=$(< "$hp_int") 466 467 if ((NRHUGE <= allocated_hugepages)) && [[ $SHRINK_HUGE != yes ]]; then 468 echo "INFO: Requested $NRHUGE hugepages but $allocated_hugepages already allocated ${2:+on node$2}" 469 return 0 470 fi 471 472 echo $((NRHUGE < 0 ? 0 : NRHUGE)) > "$hp_int" 473 474 allocated_hugepages=$(< "$hp_int") 475 if ((allocated_hugepages < NRHUGE)); then 476 cat <<- ERROR 477 478 ## ERROR: requested $NRHUGE hugepages but $allocated_hugepages could be allocated ${2:+on node$2}. 479 ## Memory might be heavily fragmented. Please try flushing the system cache, or reboot the machine. 480 ERROR 481 return 1 482 fi 483} 484 485clear_hugepages() { echo 0 > /proc/sys/vm/nr_hugepages; } 486 487configure_linux_hugepages() { 488 local node system_nodes 489 local nodes_to_use nodes_hp 490 491 if [[ $CLEAR_HUGE == yes ]]; then 492 clear_hugepages 493 fi 494 495 if [[ $HUGE_EVEN_ALLOC == yes ]]; then 496 clear_hugepages 497 check_hugepages_alloc /proc/sys/vm/nr_hugepages 498 return 0 499 fi 500 501 for node in /sys/devices/system/node/node*; do 502 [[ -e $node ]] || continue 503 nodes[${node##*node}]=$node/hugepages/hugepages-${HUGEPGSZ}kB/nr_hugepages 504 done 505 506 if ((${#nodes[@]} == 0)); then 507 # No NUMA support? Fallback to common interface 508 check_hugepages_alloc /proc/sys/vm/nr_hugepages 509 return 0 510 fi 511 512 IFS="," read -ra nodes_to_use <<< "$HUGENODE" 513 if ((${#nodes_to_use[@]} == 0)); then 514 nodes_to_use[0]=0 515 fi 516 517 # Align indexes with node ids 518 for node in "${!nodes_to_use[@]}"; do 519 if [[ ${nodes_to_use[node]} =~ ^nodes_hp\[[0-9]+\]= ]]; then 520 eval "${nodes_to_use[node]}" 521 elif [[ ${nodes_to_use[node]} =~ ^[0-9]+$ ]]; then 522 nodes_hp[nodes_to_use[node]]=$NRHUGE 523 fi 524 done 525 526 for node in "${!nodes_hp[@]}"; do 527 if [[ -z ${nodes[node]} ]]; then 528 echo "Node $node doesn't exist, ignoring" >&2 529 continue 530 fi 531 NRHUGE=${nodes_hp[node]:-$NRHUGE} check_hugepages_alloc "${nodes[node]}" "$node" 532 done 533} 534 535function configure_linux() { 536 configure_linux_pci 537 hugetlbfs_mounts=$(linux_hugetlbfs_mounts) 538 539 if [ -z "$hugetlbfs_mounts" ]; then 540 hugetlbfs_mounts=/mnt/huge 541 echo "Mounting hugetlbfs at $hugetlbfs_mounts" 542 mkdir -p "$hugetlbfs_mounts" 543 mount -t hugetlbfs nodev "$hugetlbfs_mounts" 544 fi 545 546 configure_linux_hugepages 547 548 if [ "$driver_name" = "vfio-pci" ]; then 549 if [ -n "$TARGET_USER" ]; then 550 for mount in $hugetlbfs_mounts; do 551 chown "$TARGET_USER" "$mount" 552 chmod g+w "$mount" 553 done 554 555 MEMLOCK_AMNT=$(su "$TARGET_USER" -c "ulimit -l") 556 if [[ $MEMLOCK_AMNT != "unlimited" ]]; then 557 MEMLOCK_MB=$((MEMLOCK_AMNT / 1024)) 558 cat <<- MEMLOCK 559 "$TARGET_USER" user memlock limit: $MEMLOCK_MB MB 560 561 This is the maximum amount of memory you will be 562 able to use with DPDK and VFIO if run as user "$TARGET_USER". 563 To change this, please adjust limits.conf memlock limit for user "$TARGET_USER". 564 MEMLOCK 565 if ((MEMLOCK_AMNT < 65536)); then 566 echo "" 567 echo "## WARNING: memlock limit is less than 64MB" 568 echo -n "## DPDK with VFIO may not be able to initialize " 569 echo "if run as user \"$TARGET_USER\"." 570 fi 571 fi 572 fi 573 fi 574 575 if [ $(uname -i) == "x86_64" ] && [ ! -e /dev/cpu/0/msr ]; then 576 # Some distros build msr as a module. Make sure it's loaded to ensure 577 # DPDK can easily figure out the TSC rate rather than relying on 100ms 578 # sleeps. 579 modprobe msr &> /dev/null || true 580 fi 581} 582 583function reset_linux_pci() { 584 # virtio 585 # TODO: check if virtio-pci is loaded first and just unbind if it is not loaded 586 # Requires some more investigation - for example, some kernels do not seem to have 587 # virtio-pci but just virtio_scsi instead. Also need to make sure we get the 588 # underscore vs. dash right in the virtio_scsi name. 589 modprobe virtio-pci || true 590 for bdf in "${!all_devices_d[@]}"; do 591 ((all_devices_d["$bdf"] == 0)) || continue 592 593 driver=$(collect_driver "$bdf") 594 if [[ -n $driver ]] && ! check_for_driver "$driver"; then 595 linux_bind_driver "$bdf" "$driver" 596 else 597 linux_unbind_driver "$bdf" 598 fi 599 done 600 601 echo "1" > "/sys/bus/pci/rescan" 602} 603 604function reset_linux() { 605 reset_linux_pci 606 for mount in $(linux_hugetlbfs_mounts); do 607 for hp in "$mount"/spdk*map_*; do 608 flock -n "$hp" true && rm -f "$hp" 609 done 610 done 611 rm -f /run/.spdk* 612} 613 614function status_linux() { 615 echo "Hugepages" >&2 616 printf "%-6s %10s %8s / %6s\n" "node" "hugesize" "free" "total" >&2 617 618 numa_nodes=0 619 for path in /sys/devices/system/node/node*/hugepages/hugepages-*/; do 620 numa_nodes=$((numa_nodes + 1)) 621 free_pages=$(cat $path/free_hugepages) 622 all_pages=$(cat $path/nr_hugepages) 623 624 [[ $path =~ (node[0-9]+)/hugepages/hugepages-([0-9]+kB) ]] 625 626 node=${BASH_REMATCH[1]} 627 huge_size=${BASH_REMATCH[2]} 628 629 printf "%-6s %10s %8s / %6s\n" $node $huge_size $free_pages $all_pages 630 done 631 632 # fall back to system-wide hugepages 633 if [ "$numa_nodes" = "0" ]; then 634 free_pages=$(grep HugePages_Free /proc/meminfo | awk '{ print $2 }') 635 all_pages=$(grep HugePages_Total /proc/meminfo | awk '{ print $2 }') 636 node="-" 637 huge_size="$HUGEPGSZ" 638 639 printf "%-6s %10s %8s / %6s\n" $node $huge_size $free_pages $all_pages 640 fi 641 642 printf '\n%-8s %-15s %-6s %-6s %-7s %-16s %-10s %s\n' \ 643 "Type" "BDF" "Vendor" "Device" "NUMA" "Driver" "Device" "Block devices" >&2 644 645 sorted_bdfs=($(printf '%s\n' "${!all_devices_d[@]}" | sort)) 646 647 for bdf in "${sorted_bdfs[@]}"; do 648 driver=${drivers_d["$bdf"]} 649 if [ "$numa_nodes" = "0" ]; then 650 node="-" 651 else 652 node=$(cat /sys/bus/pci/devices/$bdf/numa_node) 653 if ((node == -1)); then 654 node=unknown 655 fi 656 fi 657 if [ "$driver" = "nvme" ] && [ -d /sys/bus/pci/devices/$bdf/nvme ]; then 658 name=$(ls /sys/bus/pci/devices/$bdf/nvme) 659 else 660 name="-" 661 fi 662 663 if [[ -n ${nvme_d["$bdf"]} || -n ${virtio_d["$bdf"]} ]]; then 664 blknames=($(get_block_dev_from_bdf "$bdf")) 665 else 666 blknames=("-") 667 fi 668 669 desc="" 670 desc=${desc:-${nvme_d["$bdf"]:+NVMe}} 671 desc=${desc:-${ioat_d["$bdf"]:+I/OAT}} 672 desc=${desc:-${dsa_d["$bdf"]:+DSA}} 673 desc=${desc:-${iaa_d["$bdf"]:+IAA}} 674 desc=${desc:-${virtio_d["$bdf"]:+virtio}} 675 desc=${desc:-${vmd_d["$bdf"]:+VMD}} 676 677 printf '%-8s %-15s %-6s %-6s %-7s %-16s %-10s %s\n' \ 678 "$desc" "$bdf" "${pci_ids_vendor["$bdf"]#0x}" "${pci_ids_device["$bdf"]#0x}" \ 679 "$node" "${driver:--}" "${name:-}" "${blknames[*]:--}" 680 done 681} 682 683function status_freebsd() { 684 local pci 685 686 status_print() ( 687 local type=$1 688 local dev driver 689 690 shift 691 692 for pci; do 693 printf '%-8s %-15s %-6s %-6s %-16s\n' \ 694 "$type" \ 695 "$pci" \ 696 "${pci_ids_vendor["$pci"]}" \ 697 "${pci_ids_device["$pci"]}" \ 698 "${pci_bus_driver["$pci"]}" 699 done | sort -k2,2 700 ) 701 702 local contigmem=present 703 local contigmem_buffer_size 704 local contigmem_num_buffers 705 706 if ! kldstat -q -m contigmem; then 707 contigmem="not present" 708 fi 709 if ! contigmem_buffer_size=$(kenv hw.contigmem.buffer_size 2> /dev/null); then 710 contigmem_buffer_size="not set" 711 fi 712 if ! contigmem_num_buffers=$(kenv hw.contigmem.num_buffers 2> /dev/null); then 713 contigmem_num_buffers="not set" 714 fi 715 716 cat <<- BSD_INFO 717 Contigmem ($contigmem) 718 Buffer Size: $contigmem_buffer_size 719 Num Buffers: $contigmem_num_buffers 720 721 BSD_INFO 722 723 printf '\n%-8s %-15s %-6s %-6s %-16s\n' \ 724 "Type" "BDF" "Vendor" "Device" "Driver" >&2 725 726 status_print "NVMe" "${!nvme_d[@]}" 727 status_print "I/OAT" "${!ioat_d[@]}" 728 status_print "DSA" "${!dsa_d[@]}" 729 status_print "IAA" "${!iaa_d[@]}" 730 status_print "VMD" "${!vmd_d[@]}" 731} 732 733function configure_freebsd_pci() { 734 local BDFS 735 736 BDFS+=("${!nvme_d[@]}") 737 BDFS+=("${!ioat_d[@]}") 738 BDFS+=("${!dsa_d[@]}") 739 BDFS+=("${!iaa_d[@]}") 740 BDFS+=("${!vmd_d[@]}") 741 742 # Drop the domain part from all the addresses 743 BDFS=("${BDFS[@]#*:}") 744 745 local IFS="," 746 kldunload nic_uio.ko || true 747 kenv hw.nic_uio.bdfs="${BDFS[*]}" 748 kldload nic_uio.ko 749} 750 751function configure_freebsd() { 752 if ! check_for_driver_freebsd; then 753 echo "DPDK drivers (contigmem and/or nic_uio) are missing, aborting" >&2 754 return 1 755 fi 756 configure_freebsd_pci 757 # If contigmem is already loaded but the HUGEMEM specified doesn't match the 758 # previous value, unload contigmem so that we can reload with the new value. 759 if kldstat -q -m contigmem; then 760 # contigmem may be loaded, but the kernel environment doesn't have to 761 # be necessarily set at this point. If it isn't, kenv will fail to 762 # pick up the hw. options. Handle it. 763 if ! contigmem_num_buffers=$(kenv hw.contigmem.num_buffers); then 764 contigmem_num_buffers=-1 765 fi 2> /dev/null 766 if ((contigmem_num_buffers != HUGEMEM / 256)); then 767 kldunload contigmem.ko 768 fi 769 fi 770 if ! kldstat -q -m contigmem; then 771 kenv hw.contigmem.num_buffers=$((HUGEMEM / 256)) 772 kenv hw.contigmem.buffer_size=$((256 * 1024 * 1024)) 773 kldload contigmem.ko 774 fi 775} 776 777function reset_freebsd() { 778 kldunload contigmem.ko || true 779 kldunload nic_uio.ko || true 780} 781 782CMD=reset cache_pci_bus 783 784mode=$1 785 786if [ -z "$mode" ]; then 787 mode="config" 788fi 789 790: ${HUGEMEM:=2048} 791: ${PCI_ALLOWED:=""} 792: ${PCI_BLOCKED:=""} 793 794if [ -n "$NVME_ALLOWED" ]; then 795 PCI_ALLOWED="$PCI_ALLOWED $NVME_ALLOWED" 796fi 797 798if [ -n "$SKIP_PCI" ]; then 799 PCI_ALLOWED="none" 800fi 801 802if [ -z "$TARGET_USER" ]; then 803 TARGET_USER="$SUDO_USER" 804 if [ -z "$TARGET_USER" ]; then 805 TARGET_USER=$(logname 2> /dev/null) || true 806 fi 807fi 808 809collect_devices "$mode" 810 811if [[ $mode == reset && $PCI_BLOCK_SYNC_ON_RESET == yes ]]; then 812 # Note that this will wait only for the first block device attached to 813 # a given storage controller. For nvme this may miss some of the devs 814 # in case multiple namespaces are being in place. 815 # FIXME: Wait for nvme controller(s) to be in live state and determine 816 # number of configured namespaces, build list of potential block devs 817 # and pass them to sync_dev_uevents. Is it worth the effort? 818 bdfs_to_wait_for=() 819 for bdf in "${!all_devices_d[@]}"; do 820 ((all_devices_d["$bdf"] == 0)) || continue 821 if [[ -n ${nvme_d["$bdf"]} || -n ${virtio_d["$bdf"]} ]]; then 822 [[ $(collect_driver "$bdf") != "${drivers_d["$bdf"]}" ]] || continue 823 bdfs_to_wait_for+=("$bdf") 824 fi 825 done 826 if ((${#bdfs_to_wait_for[@]} > 0)); then 827 echo "Waiting for block devices as requested" 828 export UEVENT_TIMEOUT=5 DEVPATH_LOOKUP=yes DEVPATH_SUBSYSTEM=pci 829 "$rootdir/scripts/sync_dev_uevents.sh" \ 830 block/disk \ 831 "${bdfs_to_wait_for[@]}" & 832 sync_pid=$! 833 fi 834fi 835 836if [[ $os == Linux ]]; then 837 if [[ -n $HUGEPGSZ && ! -e /sys/kernel/mm/hugepages/hugepages-${HUGEPGSZ}kB ]]; then 838 echo "${HUGEPGSZ}kB is not supported by the running kernel, ignoring" >&2 839 unset -v HUGEPGSZ 840 fi 841 842 HUGEPGSZ=${HUGEPGSZ:-$(grep Hugepagesize /proc/meminfo | cut -d : -f 2 | tr -dc '0-9')} 843 HUGEPGSZ_MB=$((HUGEPGSZ / 1024)) 844 : ${NRHUGE=$(((HUGEMEM + HUGEPGSZ_MB - 1) / HUGEPGSZ_MB))} 845 846 if [ "$mode" == "config" ]; then 847 configure_linux 848 elif [ "$mode" == "cleanup" ]; then 849 cleanup_linux 850 clear_hugepages 851 elif [ "$mode" == "reset" ]; then 852 reset_linux 853 elif [ "$mode" == "status" ]; then 854 status_linux 855 elif [ "$mode" == "help" ]; then 856 usage $0 857 else 858 usage $0 "Invalid argument '$mode'" 859 fi 860else 861 if [ "$mode" == "config" ]; then 862 configure_freebsd 863 elif [ "$mode" == "reset" ]; then 864 reset_freebsd 865 elif [ "$mode" == "cleanup" ]; then 866 echo "setup.sh cleanup function not yet supported on $os" 867 elif [ "$mode" == "status" ]; then 868 status_freebsd 869 elif [ "$mode" == "help" ]; then 870 usage $0 871 else 872 usage $0 "Invalid argument '$mode'" 873 fi 874fi 875 876if [[ -e /proc/$sync_pid/status ]]; then 877 wait "$sync_pid" 878fi 879