#!/usr/bin/env bash
#
# Helper script for allocating hugepages and binding NVMe, I/OAT and Virtio
# PCI devices to a userspace-capable driver (vfio-pci or uio_pci_generic on
# Linux, nic_uio on other systems, which this script treats as FreeBSD).
#
# Usage: setup.sh [config|reset|status|cleanup|help]
#
# Environment variables read: HUGEMEM, NRHUGE, HUGENODE, PCI_WHITELIST,
# NVME_WHITELIST, SKIP_PCI, TARGET_USER, DRIVER_OVERRIDE (see usage()).
#
# iter_pci_class_code and iter_pci_dev_id are not defined in this file; they
# are expected to be provided by scripts/common.sh, which is sourced below.

set -e

rootdir=$(readlink -f "$(dirname "$0")")/..
source "$rootdir/scripts/common.sh"

# Print the help text and exit.
#   $1 - path the script was invoked as (only its basename is printed)
#   $2 - optional error message printed before the help text
# Exits 0 for a plain help request and 1 when an error message was supplied,
# so that an invalid invocation is not reported as success.
function usage()
{
	local options

	if [ "$(uname)" = Linux ]; then
		options="[config|reset|status|cleanup|help]"
	else
		options="[config|reset|help]"
	fi

	if [[ -n $2 ]]; then
		echo "$2"
		echo ""
	fi
	echo "Helper script for allocating hugepages and binding NVMe, I/OAT and Virtio devices to"
	echo "a generic VFIO kernel driver. If VFIO is not available on the system, this script will"
	echo "fall back to UIO. NVMe and Virtio devices with active mountpoints will be ignored."
	echo "All hugepage operations use default hugepage size on the system (hugepagesz)."
	echo "Usage: $(basename "$1") $options"
	echo
	echo "$options - as following:"
	echo "config            Default mode. Allocate hugepages and bind PCI devices."
	if [ "$(uname)" = Linux ]; then
		echo "cleanup           Remove any orphaned files that can be left in the system after SPDK application exit"
	fi
	echo "reset             Rebind PCI devices back to their original drivers."
	echo "                  Also cleanup any leftover spdk files/resources."
	echo "                  Hugepage memory size will remain unchanged."
	if [ "$(uname)" = Linux ]; then
		echo "status            Print status of all SPDK-compatible devices on the system."
	fi
	echo "help              Print this help message."
	echo
	echo "The following environment variables can be specified."
	echo "HUGEMEM           Size of hugepage memory to allocate (in MB). 2048 by default."
	echo "                  For NUMA systems, the hugepages will be evenly distributed"
	echo "                  between CPU nodes"
	echo "NRHUGE            Number of hugepages to allocate. This variable overwrites HUGEMEM."
	echo "HUGENODE          Specific NUMA node to allocate hugepages on. To allocate"
	echo "                  hugepages on multiple nodes run this script multiple times -"
	echo "                  once for each node."
	echo "PCI_WHITELIST     Whitespace separated list of PCI devices (NVMe, I/OAT, Virtio) to bind."
	echo "                  Each device must be specified as a full PCI address."
	echo "                  E.g. PCI_WHITELIST=\"0000:01:00.0 0000:02:00.0\""
	echo "                  To blacklist all PCI devices use a non-valid address."
	echo "                  E.g. PCI_WHITELIST=\"none\""
	echo "                  If empty or unset, all PCI devices will be bound."
	echo "TARGET_USER       User that will own hugepage mountpoint directory and vfio groups."
	echo "                  By default the current user will be used."
	echo "DRIVER_OVERRIDE   Disable automatic vfio-pci/uio_pci_generic selection and forcefully"
	echo "                  bind devices to the given driver."
	echo "                  E.g. DRIVER_OVERRIDE=uio_pci_generic or DRIVER_OVERRIDE=vfio-pci"

	if [[ -n $2 ]]; then
		exit 1
	fi
	exit 0
}

# Print the device IDs (hex digits following "0x") of every line in
# include/spdk/pci_ids.h that contains the given macro-name fragment.
#   $1 - macro-name fragment, e.g. PCI_DEVICE_ID_INTEL_IOAT
# Prints nothing when the header has no matching line.
function setup_pci_device_ids {
	grep "$1" "$rootdir/include/spdk/pci_ids.h" | awk -F"x" '{print $2}'
}

# In monolithic kernels the lsmod won't work. So
# back that with a /sys/module check. Return a different code for
# built-in vs module just in case we want that down the road.
#   $1 - driver name (dashes are treated as underscores)
# Returns: 1 if lsmod lists the module, 2 if only /sys/module/<name> exists,
#          0 if neither is found. Note that 0 means "not present".
function check_for_driver {
	local module="${1//-/_}"

	# Compare the whole first column so that e.g. "nvme" does not match
	# "nvme_core" or "nvmet".
	if lsmod | awk -v m="$module" '$1 == m { found = 1 } END { exit !found }'; then
		return 1
	fi
	if [[ -d /sys/module/$module ]]; then
		return 2
	fi
	return 0
}

# Check a PCI address against PCI_WHITELIST.
#   $1 - full PCI address (BDF)
# NOTE: the return value is inverted with respect to the name and is kept
# that way for compatibility: 1 means the device may be bound (whitelist is
# empty or contains the address), 0 means the device must be skipped. Callers
# therefore use `if pci_can_bind "$bdf"; then <skip>; fi`.
function pci_can_bind() {
	local allowed

	if [[ ${#PCI_WHITELIST[@]} == 0 ]]; then
		#no whitelist specified, bind all devices
		return 1
	fi

	for allowed in "${PCI_WHITELIST[@]}"; do
		if [ "$allowed" == "$1" ]; then
			return 1
		fi
	done
	return 0
}

# Bind a PCI device to the given driver, unbinding it from its current
# driver first. Does nothing if the device is already bound to that driver.
#   $1 - PCI address (BDF)
#   $2 - name of the driver to bind to
# If /dev/vfio/<iommu group> exists afterwards and TARGET_USER is set, the
# group node is chown'ed to TARGET_USER.
function linux_bind_driver() {
	local bdf="$1"
	local driver_name="$2"
	local old_driver_name="no driver"
	local ven_dev_id iommu_group

	# "vendor device" pair in the form expected by new_id/remove_id.
	ven_dev_id=$(lspci -n -s "$bdf" | cut -d' ' -f3 | sed 's/:/ /')

	if [ -e "/sys/bus/pci/devices/$bdf/driver" ]; then
		old_driver_name=$(basename "$(readlink "/sys/bus/pci/devices/$bdf/driver")")

		if [ "$driver_name" = "$old_driver_name" ]; then
			return 0
		fi

		# Best effort: a failure to write remove_id is deliberately ignored.
		echo "$ven_dev_id" > "/sys/bus/pci/devices/$bdf/driver/remove_id" 2> /dev/null || true
		echo "$bdf" > "/sys/bus/pci/devices/$bdf/driver/unbind"
	fi

	echo "$bdf ($ven_dev_id): $old_driver_name -> $driver_name"

	# Both writes are best effort; failures are deliberately ignored.
	echo "$ven_dev_id" > "/sys/bus/pci/drivers/$driver_name/new_id" 2> /dev/null || true
	echo "$bdf" > "/sys/bus/pci/drivers/$driver_name/bind" 2> /dev/null || true

	iommu_group=$(basename "$(readlink -f "/sys/bus/pci/devices/$bdf/iommu_group")")
	if [[ -e "/dev/vfio/$iommu_group" && -n "$TARGET_USER" ]]; then
		chown "$TARGET_USER" "/dev/vfio/$iommu_group"
	fi
}

# Unbind a PCI device from whatever driver it is currently bound to.
#   $1 - PCI address (BDF)
# Does nothing if the device has no driver.
function linux_unbind_driver() {
	local bdf="$1"
	local ven_dev_id old_driver_name

	ven_dev_id=$(lspci -n -s "$bdf" | cut -d' ' -f3 | sed 's/:/ /')

	if ! [ -e "/sys/bus/pci/devices/$bdf/driver" ]; then
		return 0
	fi

	old_driver_name=$(basename "$(readlink "/sys/bus/pci/devices/$bdf/driver")")

	echo "$ven_dev_id" > "/sys/bus/pci/devices/$bdf/driver/remove_id" 2> /dev/null || true
	echo "$bdf" > "/sys/bus/pci/devices/$bdf/driver/unbind"
	echo "$bdf ($ven_dev_id): $old_driver_name -> no driver"
}

# Print the mountpoint of every mounted hugetlbfs filesystem, one per line.
function linux_hugetlbfs_mounts() {
	mount | awk '/ type hugetlbfs / { print $3 }'
}

# Look up the nvme block device that belongs to a PCI address.
#   $1 - PCI address (BDF)
#   $2 - name of the caller's variable that receives the device name;
#        it is left untouched when no device matches
function get_nvme_name_from_bdf {
	local nvme_devs dev link_name link_bdf

	# grep exits non-zero when there are no nvme block devices at all.
	nvme_devs=$(lsblk -d --output NAME | grep "^nvme") || true

	for dev in $nvme_devs; do
		link_name=$(readlink "/sys/block/$dev/device/device") || true
		if [ -z "$link_name" ]; then
			link_name=$(readlink "/sys/block/$dev/device")
		fi
		link_bdf=$(basename "$link_name")
		if [ "$link_bdf" = "$1" ]; then
			printf -v "$2" '%s' "$dev"
			return 0
		fi
	done
	return 0
}

# Collect the block devices whose /sys/block link target contains a PCI address.
#   $1 - PCI address (BDF)
#   $2 - name of the caller's variable that receives the list; every name is
#        preceded by a single space, the list is empty when nothing matches
function get_virtio_names_from_bdf {
	local blk_devs dev
	local virtio_names=''

	blk_devs=$(lsblk --nodeps --output NAME)

	for dev in $blk_devs; do
		# -F: the address contains dots, which must not act as wildcards.
		if readlink "/sys/block/$dev" | grep -qF -- "$1"; then
			virtio_names="$virtio_names $dev"
		fi
	done

	printf -v "$2" '%s' "$virtio_names"
}

# Bind all whitelisted NVMe, I/OAT and Virtio devices without active
# mountpoints to vfio-pci, uio_pci_generic or DRIVER_OVERRIDE.
# Sets the global driver_name, which configure_linux reads afterwards.
function configure_linux_pci {
	local bdf dev_id blkname blknames mountpoints

	if [ -z "${DRIVER_OVERRIDE}" ]; then
		driver_name=vfio-pci
		if [ -z "$(ls /sys/kernel/iommu_groups 2> /dev/null)" ]; then
			# No IOMMU. Use uio.
			driver_name=uio_pci_generic
		fi
	else
		driver_name="${DRIVER_OVERRIDE}"
	fi

	# NVMe
	modprobe "$driver_name"
	for bdf in $(iter_pci_class_code 01 08 02); do
		blkname=''
		get_nvme_name_from_bdf "$bdf" blkname
		if pci_can_bind "$bdf"; then
			echo "Skipping un-whitelisted NVMe controller $blkname ($bdf)"
			continue
		fi
		if [ "$blkname" != "" ]; then
			mountpoints=$(lsblk "/dev/$blkname" --output MOUNTPOINT -n | wc -w)
		else
			mountpoints="0"
		fi
		if [ "$mountpoints" = "0" ]; then
			linux_bind_driver "$bdf" "$driver_name"
		else
			echo "Active mountpoints on /dev/$blkname, so not binding PCI dev $bdf"
		fi
	done

	# IOAT
	for dev_id in $(setup_pci_device_ids "PCI_DEVICE_ID_INTEL_IOAT"); do
		for bdf in $(iter_pci_dev_id 8086 "$dev_id"); do
			if pci_can_bind "$bdf"; then
				echo "Skipping un-whitelisted I/OAT device at $bdf"
				continue
			fi

			linux_bind_driver "$bdf" "$driver_name"
		done
	done

	# virtio
	for dev_id in $(setup_pci_device_ids "PCI_DEVICE_ID_VIRTIO"); do
		for bdf in $(iter_pci_dev_id 1af4 "$dev_id"); do
			if pci_can_bind "$bdf"; then
				echo "Skipping un-whitelisted Virtio device at $bdf"
				continue
			fi
			blknames=''
			get_virtio_names_from_bdf "$bdf" blknames
			for blkname in $blknames; do
				if mount | grep -q "/dev/$blkname"; then
					echo "Active mountpoints on /dev/$blkname, so not binding PCI dev $bdf"
					# Skip the whole device, not just this block device.
					continue 2
				fi
			done

			linux_bind_driver "$bdf" "$driver_name"
		done
	done

	echo "1" > "/sys/bus/pci/rescan"
}

# Remove files and directories left behind under {/var/run,/tmp}/dpdk,
# $XDG_RUNTIME_DIR/dpdk and /dev/shm, skipping anything that some process
# still has open. Prints "Clean" when finished or when nothing was found.
function cleanup_linux {
	local -a dirs_to_clean=() files_to_clean=()
	local candidate resolved dir f fd_dir fd_targets
	local opened_files=""
	# spdk<digits> or spdk_pid<digits> as the last path component.
	local spdk_dir_re='/spdk(_pid)?[0-9]+$'
	local shm_name_re='(spdk_tgt|iscsi|vhost|nvmf|rocksdb|bdevtest|bdevperf)_trace|spdk_iscsi_conns'

	shopt -s nullglob
	for candidate in {/var/run,/tmp}/dpdk/spdk*; do
		# Canonicalise so the path is comparable with the /proc/<pid>/fd
		# targets collected below.
		if [[ $candidate =~ $spdk_dir_re ]] && resolved=$(readlink -e "$candidate"); then
			dirs_to_clean+=("$resolved")
		fi
	done
	if [[ -d $XDG_RUNTIME_DIR && $XDG_RUNTIME_DIR != *" "* ]]; then
		for candidate in "$XDG_RUNTIME_DIR"/dpdk/spdk*; do
			if [[ $candidate =~ $spdk_dir_re ]] && resolved=$(readlink -e "$candidate"); then
				dirs_to_clean+=("$resolved")
			fi
		done
	fi

	for dir in "${dirs_to_clean[@]}"; do
		for candidate in "$dir"/*; do
			if resolved=$(readlink -e "$candidate"); then
				files_to_clean+=("$resolved")
			fi
		done
	done

	# Test every name on its own. Filtering the joined output of
	# `echo /dev/shm/*` would select every file in /dev/shm as soon as a
	# single name matched.
	for candidate in /dev/shm/*; do
		if [[ $candidate =~ $shm_name_re ]] && resolved=$(readlink -e "$candidate"); then
			files_to_clean+=("$resolved")
		fi
	done
	shopt -u nullglob

	if ((${#files_to_clean[@]} == 0)); then
		echo "Clean"
		return 0
	fi

	for fd_dir in /proc/[0-9]*/fd; do
		# Processes may exit, or be unreadable, while we iterate.
		fd_targets=$(readlink -e "$fd_dir"/* 2> /dev/null || true)
		if [[ -n $fd_targets ]]; then
			# Command substitution strips the trailing newline; add it back
			# so entries of consecutive processes are not fused together.
			opened_files+="$fd_targets"$'\n'
		fi
	done

	if [[ -z "$opened_files" ]]; then
		echo "Can't get list of opened files!"
		exit 1
	fi

	echo 'Cleaning'
	for f in "${files_to_clean[@]}"; do
		# -F -x: compare whole lines literally, paths are not regexes.
		if ! grep -Fxq -- "$f" <<< "$opened_files"; then
			echo "Removing: $f"
			rm "$f"
		else
			echo "Still open: $f"
		fi
	done

	for dir in "${dirs_to_clean[@]}"; do
		if ! grep -Fxq -- "$dir" <<< "$opened_files"; then
			echo "Removing: $dir"
			rmdir "$dir"
		else
			echo "Still open: $dir"
		fi
	done
	echo "Clean"
}

# "config" mode on Linux: bind PCI devices, make sure hugetlbfs is mounted,
# allocate NRHUGE hugepages (system-wide, or on HUGENODE when set) and, when
# vfio-pci is in use, hand the mountpoints to TARGET_USER and report the
# memlock limit.
function configure_linux {
	local hugetlbfs_mounts hugepages_target allocated_hugepages mount
	local memlock_amnt memlock_mb

	configure_linux_pci
	hugetlbfs_mounts=$(linux_hugetlbfs_mounts)

	if [ -z "$hugetlbfs_mounts" ]; then
		hugetlbfs_mounts=/mnt/huge
		echo "Mounting hugetlbfs at $hugetlbfs_mounts"
		mkdir -p "$hugetlbfs_mounts"
		mount -t hugetlbfs nodev "$hugetlbfs_mounts"
	fi

	if [ -z "$NRHUGE" ]; then
		echo ""
		echo "## ERROR: cannot determine the number of hugepages to allocate."
		echo "## No default hugepage size was found in /proc/meminfo; set NRHUGE explicitly."
		exit 1
	fi

	if [ -z "$HUGENODE" ]; then
		hugepages_target="/proc/sys/vm/nr_hugepages"
	else
		hugepages_target="/sys/devices/system/node/node${HUGENODE}/hugepages/hugepages-${HUGEPGSZ}kB/nr_hugepages"
	fi

	echo "$NRHUGE" > "$hugepages_target"
	# Read the value back to see how many pages were actually allocated.
	allocated_hugepages=$(< "$hugepages_target")
	if [ "$allocated_hugepages" -lt "$NRHUGE" ]; then
		echo ""
		echo "## ERROR: requested $NRHUGE hugepages but only $allocated_hugepages could be allocated."
		echo "## Memory might be heavily fragmented. Please try flushing the system cache, or reboot the machine."
		exit 1
	fi

	if [ "$driver_name" = "vfio-pci" ]; then
		if [ -n "$TARGET_USER" ]; then
			for mount in $hugetlbfs_mounts; do
				chown "$TARGET_USER" "$mount"
				chmod g+w "$mount"
			done
		fi

		memlock_amnt=$(ulimit -l)
		if [ "$memlock_amnt" != "unlimited" ]; then
			memlock_mb=$((memlock_amnt / 1024))
			echo ""
			echo "Current user memlock limit: ${memlock_mb} MB"
			echo ""
			echo "This is the maximum amount of memory you will be"
			echo "able to use with DPDK and VFIO if run as current user."
			echo -n "To change this, please adjust limits.conf memlock "
			echo "limit for current user."

			if [ "$memlock_amnt" -lt 65536 ]; then
				echo ""
				echo "## WARNING: memlock limit is less than 64MB"
				echo -n "## DPDK with VFIO may not be able to initialize "
				echo "if run as current user."
			fi
		fi
	fi
}

# Rebind whitelisted NVMe, I/OAT and Virtio devices to nvme, ioatdma and
# virtio-pci respectively. NVMe and I/OAT devices are only unbound when their
# kernel driver is not present.
function reset_linux_pci {
	local bdf dev_id driver_loaded

	# NVMe
	# check_for_driver returns non-zero when the driver IS present, so its
	# status has to be captured without tripping set -e.
	driver_loaded=0
	check_for_driver nvme || driver_loaded=$?
	for bdf in $(iter_pci_class_code 01 08 02); do
		if pci_can_bind "$bdf"; then
			echo "Skipping un-whitelisted NVMe controller at $bdf"
			continue
		fi
		if [ "$driver_loaded" -ne 0 ]; then
			linux_bind_driver "$bdf" nvme
		else
			linux_unbind_driver "$bdf"
		fi
	done

	# IOAT
	driver_loaded=0
	check_for_driver ioatdma || driver_loaded=$?
	for dev_id in $(setup_pci_device_ids "PCI_DEVICE_ID_INTEL_IOAT"); do
		for bdf in $(iter_pci_dev_id 8086 "$dev_id"); do
			if pci_can_bind "$bdf"; then
				echo "Skipping un-whitelisted I/OAT device at $bdf"
				continue
			fi
			if [ "$driver_loaded" -ne 0 ]; then
				linux_bind_driver "$bdf" ioatdma
			else
				linux_unbind_driver "$bdf"
			fi
		done
	done

	# virtio
	# TODO: check if virtio-pci is loaded first and just unbind if it is not loaded
	# Requires some more investigation - for example, some kernels do not seem to have
	# virtio-pci but just virtio_scsi instead. Also need to make sure we get the
	# underscore vs. dash right in the virtio_scsi name.
	modprobe virtio-pci || true
	for dev_id in $(setup_pci_device_ids "PCI_DEVICE_ID_VIRTIO"); do
		for bdf in $(iter_pci_dev_id 1af4 "$dev_id"); do
			if pci_can_bind "$bdf"; then
				echo "Skipping un-whitelisted Virtio device at $bdf"
				continue
			fi
			linux_bind_driver "$bdf" virtio-pci
		done
	done

	echo "1" > "/sys/bus/pci/rescan"
}

# "reset" mode on Linux: rebind devices and remove leftover spdk map files
# from every hugetlbfs mount as well as /run/.spdk*.
function reset_linux {
	local mount

	reset_linux_pci
	for mount in $(linux_hugetlbfs_mounts); do
		rm -f "$mount"/spdk*map_*
	done
	rm -f /run/.spdk*
}

# "status" mode on Linux: print hugepage usage per NUMA node followed by the
# driver currently bound to every NVMe, I/OAT and Virtio device.
function status_linux {
	local numa_nodes=0
	local path free_pages all_pages node huge_size
	local bdf dev_id driver name blknames

	echo "Hugepages"
	printf "%-6s %10s %8s / %6s\n" "node" "hugesize" "free" "total"

	shopt -s nullglob
	# node[0-9]* rather than node? so that nodes 10 and above are listed too.
	for path in /sys/devices/system/node/node[0-9]*/hugepages/hugepages-*/; do
		numa_nodes=$((numa_nodes + 1))
		free_pages=$(< "$path/free_hugepages")
		all_pages=$(< "$path/nr_hugepages")

		[[ $path =~ (node[0-9]+)/hugepages/hugepages-([0-9]+kB) ]]

		node=${BASH_REMATCH[1]}
		huge_size=${BASH_REMATCH[2]}

		printf "%-6s %10s %8s / %6s\n" "$node" "$huge_size" "$free_pages" "$all_pages"
	done
	shopt -u nullglob

	# fall back to system-wide hugepages
	if [ "$numa_nodes" = "0" ]; then
		free_pages=$(grep HugePages_Free /proc/meminfo | awk '{ print $2 }')
		all_pages=$(grep HugePages_Total /proc/meminfo | awk '{ print $2 }')
		node="-"
		huge_size="$HUGEPGSZ"

		printf "%-6s %10s %8s / %6s\n" "$node" "$huge_size" "$free_pages" "$all_pages"
	fi

	echo "NVMe devices"

	echo -e "BDF\t\tNuma Node\tDriver name\t\tDevice name"
	for bdf in $(iter_pci_class_code 01 08 02); do
		driver=$(grep DRIVER "/sys/bus/pci/devices/$bdf/uevent" | awk -F"=" '{print $2}')
		node=$(< "/sys/bus/pci/devices/$bdf/numa_node")
		if [[ "$driver" == "nvme" && -d /sys/bus/pci/devices/$bdf/nvme ]]; then
			name="\t$(ls "/sys/bus/pci/devices/$bdf/nvme")"
		else
			name="-"
		fi
		echo -e "$bdf\t$node\t\t$driver\t\t$name"
	done

	echo "I/OAT DMA"

	echo -e "BDF\t\tNuma Node\tDriver Name"
	for dev_id in $(setup_pci_device_ids "PCI_DEVICE_ID_INTEL_IOAT"); do
		for bdf in $(iter_pci_dev_id 8086 "$dev_id"); do
			driver=$(grep DRIVER "/sys/bus/pci/devices/$bdf/uevent" | awk -F"=" '{print $2}')
			node=$(< "/sys/bus/pci/devices/$bdf/numa_node")
			echo -e "$bdf\t$node\t\t$driver"
		done
	done

	echo "virtio"

	echo -e "BDF\t\tNuma Node\tDriver Name\t\tDevice Name"
	for dev_id in $(setup_pci_device_ids "PCI_DEVICE_ID_VIRTIO"); do
		for bdf in $(iter_pci_dev_id 1af4 "$dev_id"); do
			driver=$(grep DRIVER "/sys/bus/pci/devices/$bdf/uevent" | awk -F"=" '{print $2}')
			node=$(< "/sys/bus/pci/devices/$bdf/numa_node")
			blknames=''
			get_virtio_names_from_bdf "$bdf" blknames
			echo -e "$bdf\t$node\t\t$driver\t\t$blknames"
		done
	done
}

# Non-Linux "config" mode (FreeBSD tooling): pass the addresses of all NVMe
# (class 0x010802) and Intel I/OAT devices reported by pciconf to nic_uio
# through the hw.nic_uio.bdfs kenv variable and (re)load nic_uio.
function configure_freebsd_pci {
	local dev_id bdfs
	# NVMe
	local grep_str="class=0x010802"

	# IOAT
	for dev_id in $(setup_pci_device_ids "PCI_DEVICE_ID_INTEL_IOAT"); do
		grep_str="${grep_str}\|chip=0x${dev_id}8086"
	done

	# Join fields 2-4 of every matching pciconf line into a comma separated list.
	bdfs=$(pciconf -l | grep "${grep_str}" \
		| awk -F: '{if (count > 0) printf ","; printf "%s:%s:%s",$2,$3,$4; count++}')

	kldunload nic_uio.ko || true
	kenv hw.nic_uio.bdfs="$bdfs"
	kldload nic_uio.ko
}

# Non-Linux "config" mode: bind PCI devices and load contigmem with
# HUGEMEM / 256 buffers of 256 MB each.
function configure_freebsd {
	configure_freebsd_pci
	# If contigmem is already loaded but the HUGEMEM specified doesn't match the
	# previous value, unload contigmem so that we can reload with the new value.
	if kldstat -q -m contigmem; then
		if [ "$(kenv hw.contigmem.num_buffers)" -ne "$((HUGEMEM / 256))" ]; then
			kldunload contigmem.ko
		fi
	fi
	if ! kldstat -q -m contigmem; then
		kenv hw.contigmem.num_buffers=$((HUGEMEM / 256))
		kenv hw.contigmem.buffer_size=$((256 * 1024 * 1024))
		kldload contigmem.ko
	fi
}

# Non-Linux "reset" mode: unload contigmem and nic_uio, ignoring failures.
function reset_freebsd {
	kldunload contigmem.ko || true
	kldunload nic_uio.ko || true
}

mode=$1

if [ -z "$mode" ]; then
	mode="config"
fi

: "${HUGEMEM:=2048}"
: "${PCI_WHITELIST:=""}"

if [ -n "$NVME_WHITELIST" ]; then
	PCI_WHITELIST="$PCI_WHITELIST $NVME_WHITELIST"
fi

if [ -n "$SKIP_PCI" ]; then
	PCI_WHITELIST="none"
fi

# Deliberately unquoted: the whitespace separated string is split into an array.
# shellcheck disable=SC2206
declare -a PCI_WHITELIST=(${PCI_WHITELIST})

if [ -z "$TARGET_USER" ]; then
	TARGET_USER="$SUDO_USER"
	if [ -z "$TARGET_USER" ]; then
		TARGET_USER=$(logname 2> /dev/null) || true
	fi
fi

if [ "$(uname)" = Linux ]; then
	# Default hugepage size in kB; evaluates to 0 when /proc/meminfo has no
	# Hugepagesize line.
	HUGEPGSZ=$(($(grep Hugepagesize /proc/meminfo | cut -d : -f 2 | tr -dc '0-9')))
	HUGEPGSZ_MB=$((HUGEPGSZ / 1024))
	# Derive NRHUGE from HUGEMEM only when NRHUGE is unset. The calculation is
	# done in kB so that hugepage sizes below 1 MB do not divide by zero, and
	# it is skipped altogether when no hugepage size is known so that modes
	# which do not need it (help, status, ...) keep working.
	if [[ -z ${NRHUGE+x} ]] && ((HUGEPGSZ > 0)); then
		NRHUGE=$(((HUGEMEM * 1024 + HUGEPGSZ - 1) / HUGEPGSZ))
	fi

	case "$mode" in
		config) configure_linux ;;
		cleanup) cleanup_linux ;;
		reset) reset_linux ;;
		status) status_linux ;;
		help) usage "$0" ;;
		*) usage "$0" "Invalid argument '$mode'" ;;
	esac
else
	case "$mode" in
		config) configure_freebsd ;;
		reset) reset_freebsd ;;
		cleanup) echo "setup.sh cleanup function not yet supported on $(uname)" ;;
		status) echo "setup.sh status function not yet supported on $(uname)" ;;
		help) usage "$0" ;;
		*) usage "$0" "Invalid argument '$mode'" ;;
	esac
fi