#!/usr/bin/env bash
#
# Helper script for allocating hugepages and binding NVMe, I/OAT and Virtio
# PCI devices to a userspace-capable driver (vfio-pci, or uio_pci_generic when
# no IOMMU groups are present) on Linux, and for loading nic_uio/contigmem on
# other systems (the non-Linux branch uses FreeBSD tools: pciconf, kenv, kld*).
#
# Usage: setup.sh [config|reset|status|help]
#
# Environment variables read: HUGEMEM, NRHUGE, HUGENODE, PCI_WHITELIST,
# NVME_WHITELIST, SKIP_PCI, TARGET_USER (see usage() for details).
#
# NOTE(review): iter_pci_class_code and iter_pci_dev_id are not defined in this
# file; they are presumably provided by scripts/common.sh sourced below.

set -e

rootdir=$(readlink -f "$(dirname "$0")")/..
source "$rootdir/scripts/common.sh"

# Print the help text and exit.
# Arguments: $1 - script path (only its basename is printed)
#            $2 - optional message printed before the help text
# Exits with status 0 in all cases (also when $2 reports an invalid argument).
function usage()
{
	local options

	if [ "$(uname)" = Linux ]; then
		options="[config|reset|status|help]"
	else
		options="[config|reset|help]"
	fi

	if [[ -n $2 ]]; then
		echo "$2"
		echo ""
	fi
	echo "Helper script for allocating hugepages and binding NVMe, I/OAT and Virtio devices to"
	echo "a generic VFIO kernel driver. If VFIO is not available on the system, this script will"
	echo "fall back to UIO. NVMe and Virtio devices with active mountpoints will be ignored."
	echo "All hugepage operations use default hugepage size on the system (hugepagesz)."
	echo "Usage: $(basename "$1") $options"
	echo
	echo "$options - as following:"
	echo "config            Default mode. Allocate hugepages and bind PCI devices."
	echo "reset             Rebind PCI devices back to their original drivers."
	echo "                  Also cleanup any leftover spdk files/resources."
	echo "                  Hugepage memory size will remain unchanged."
	if [ "$(uname)" = Linux ]; then
		echo "status            Print status of all SPDK-compatible devices on the system."
	fi
	echo "help              Print this help message."
	echo
	echo "The following environment variables can be specified."
	echo "HUGEMEM           Size of hugepage memory to allocate (in MB). 2048 by default."
	echo "                  For NUMA systems, the hugepages will be evenly distributed"
	echo "                  between CPU nodes"
	echo "NRHUGE            Number of hugepages to allocate. This variable overwrites HUGEMEM."
	echo "HUGENODE          Specific NUMA node to allocate hugepages on. To allocate"
	echo "                  hugepages on multiple nodes run this script multiple times -"
	echo "                  once for each node."
	echo "PCI_WHITELIST     Whitespace separated list of PCI devices (NVMe, I/OAT, Virtio) to bind."
	echo "                  Each device must be specified as a full PCI address."
	echo "                  E.g. PCI_WHITELIST=\"0000:01:00.0 0000:02:00.0\""
	echo "                  To blacklist all PCI devices use a non-valid address."
	echo "                  E.g. PCI_WHITELIST=\"none\""
	echo "                  If empty or unset, all PCI devices will be bound."
	echo "TARGET_USER       User that will own hugepage mountpoint directory and vfio groups."
	echo "                  By default the current user will be used."
	exit 0
}

# Report whether a kernel driver is available.
# In monolithic kernels lsmod won't list built-in drivers, so the lsmod lookup
# is backed by a /sys/module check. A different code is returned for built-in
# vs. module just in case we want that down the road.
# Arguments: $1 - driver/module name
# Returns:   1 - listed by lsmod (loaded as a module)
#            2 - not listed by lsmod but /sys/module/<name> exists
#            0 - not found
# NOTE: the status is a tri-state value, not success/failure; callers must
# capture it without tripping `set -e`.
function check_for_driver {
	# lsmod and /sys/module spell module names with underscores.
	local module=${1//-/_}

	# Compare the whole first column so that e.g. "nvme" does not match
	# "nvme_core" or "nvmet".
	if lsmod | awk -v mod="$module" '$1 == mod { found = 1 } END { exit !found }'; then
		return 1
	fi
	if [[ -d /sys/module/$module ]]; then
		return 2
	fi
	return 0
}

# Check a PCI address against PCI_WHITELIST.
# Arguments: $1 - PCI address (BDF)
# Returns:   1 - the device may be bound (whitelist empty, or address listed)
#            0 - the device is not whitelisted
# NOTE: this is inverted with respect to the usual shell convention; use
# pci_is_excluded below when a readable condition is wanted.
function pci_can_bind() {
	local allowed

	if [[ ${#PCI_WHITELIST[@]} == 0 ]]; then
		# no whitelist specified, bind all devices
		return 1
	fi

	for allowed in "${PCI_WHITELIST[@]}"; do
		if [ "$allowed" == "$1" ]; then
			return 1
		fi
	done
	return 0
}

# Succeeds (status 0) when the device in $1 must be skipped because it is not
# whitelisted. Thin wrapper that gives pci_can_bind's inverted status a name.
function pci_is_excluded() {
	pci_can_bind "$1"
}

# Print the hexadecimal device IDs (text after the "x" of "0x....") of every
# line in include/spdk/pci_ids.h matching the pattern in $1, one per line.
function pci_ids_from_header() {
	grep "$1" "$rootdir/include/spdk/pci_ids.h" | awk -F"x" '{print $2}'
}

# Bind a PCI device to a driver through sysfs.
# Arguments: $1 - PCI address (BDF)
#            $2 - name of the driver to bind to
# Does nothing when the device is already bound to that driver. If a
# /dev/vfio/<iommu group> node exists afterwards and TARGET_USER is set, the
# node is chown'ed to TARGET_USER.
function linux_bind_driver() {
	local bdf="$1"
	local driver_name="$2"
	local old_driver_name="no driver"
	local ven_dev_id iommu_group

	# "vendor:device" from lspci, converted to the "vendor device" form that
	# is written to new_id/remove_id.
	ven_dev_id=$(lspci -n -s "$bdf" | cut -d' ' -f3 | sed 's/:/ /')

	if [ -e "/sys/bus/pci/devices/$bdf/driver" ]; then
		old_driver_name=$(basename "$(readlink "/sys/bus/pci/devices/$bdf/driver")")

		if [ "$driver_name" = "$old_driver_name" ]; then
			return 0
		fi

		# remove_id is best effort; a failing unbind aborts (set -e).
		echo "$ven_dev_id" > "/sys/bus/pci/devices/$bdf/driver/remove_id" 2> /dev/null || true
		echo "$bdf" > "/sys/bus/pci/devices/$bdf/driver/unbind"
	fi

	echo "$bdf ($ven_dev_id): $old_driver_name -> $driver_name"

	# Both writes are best effort: errors are deliberately ignored.
	echo "$ven_dev_id" > "/sys/bus/pci/drivers/$driver_name/new_id" 2> /dev/null || true
	echo "$bdf" > "/sys/bus/pci/drivers/$driver_name/bind" 2> /dev/null || true

	iommu_group=$(basename "$(readlink -f "/sys/bus/pci/devices/$bdf/iommu_group")")
	if [ -e "/dev/vfio/$iommu_group" ] && [ -n "$TARGET_USER" ]; then
		chown "$TARGET_USER" "/dev/vfio/$iommu_group"
	fi
}

# Unbind a PCI device from whatever driver it is currently bound to.
# Arguments: $1 - PCI address (BDF)
# Does nothing when the device has no driver.
function linux_unbind_driver() {
	local bdf="$1"
	local ven_dev_id old_driver_name

	ven_dev_id=$(lspci -n -s "$bdf" | cut -d' ' -f3 | sed 's/:/ /')

	if ! [ -e "/sys/bus/pci/devices/$bdf/driver" ]; then
		return 0
	fi

	old_driver_name=$(basename "$(readlink "/sys/bus/pci/devices/$bdf/driver")")

	echo "$ven_dev_id" > "/sys/bus/pci/devices/$bdf/driver/remove_id" 2> /dev/null || true
	echo "$bdf" > "/sys/bus/pci/devices/$bdf/driver/unbind"
	echo "$bdf ($ven_dev_id): $old_driver_name -> no driver"
}

# Print the mountpoint of every mounted hugetlbfs filesystem, one per line.
function linux_hugetlbfs_mounts() {
	mount | awk '/ type hugetlbfs / { print $3 }'
}

# Look up the NVMe block device name that belongs to a PCI address.
# Arguments: $1 - PCI address (BDF)
#            $2 - name of the variable that receives the device name
# The variable named by $2 is left untouched when no device matches.
function get_nvme_name_from_bdf {
	local nvme_devs dev link_name link_bdf

	# grep exits non-zero when there is no nvme block device at all.
	nvme_devs=$(lsblk -d --output NAME | grep "^nvme") || true

	for dev in $nvme_devs; do
		link_name=$(readlink "/sys/block/$dev/device/device") || true
		if [ -z "$link_name" ]; then
			link_name=$(readlink "/sys/block/$dev/device")
		fi
		link_bdf=$(basename "$link_name")
		if [ "$link_bdf" = "$1" ]; then
			printf -v "$2" '%s' "$dev"
			return
		fi
	done
}

# Collect the block devices whose /sys/block link target contains a PCI address.
# Arguments: $1 - PCI address (BDF)
#            $2 - name of the variable that receives the names; every name is
#                 preceded by a single space, the value is empty when nothing
#                 matches
function get_virtio_names_from_bdf {
	local blk_devs dev
	local virtio_names=''

	blk_devs=$(lsblk --nodeps -n --output NAME)

	for dev in $blk_devs; do
		# Fixed-string match: the dots of a PCI address are not wildcards.
		if readlink "/sys/block/$dev" | grep -qF -- "$1"; then
			virtio_names="$virtio_names $dev"
		fi
	done

	printf -v "$2" '%s' "$virtio_names"
}

# Bind all whitelisted NVMe, I/OAT and Virtio devices to vfio-pci, or to
# uio_pci_generic when /sys/kernel/iommu_groups has no entries. Devices backing
# mounted filesystems are left alone.
# Globals: driver_name (written; read later by configure_linux)
function configure_linux_pci {
	local bdf dev_id blkname blknames mountpoints

	driver_name=vfio-pci
	if ! compgen -G "/sys/kernel/iommu_groups/*" > /dev/null; then
		# No IOMMU. Use uio.
		driver_name=uio_pci_generic
	fi

	# NVMe
	modprobe "$driver_name" || true
	for bdf in $(iter_pci_class_code 01 08 02); do
		blkname=''
		get_nvme_name_from_bdf "$bdf" blkname
		if pci_is_excluded "$bdf"; then
			echo "Skipping un-whitelisted NVMe controller $blkname ($bdf)"
			continue
		fi
		if [ "$blkname" != "" ]; then
			mountpoints=$(lsblk "/dev/$blkname" --output MOUNTPOINT -n | wc -w)
		else
			mountpoints="0"
		fi
		if [ "$mountpoints" = "0" ]; then
			linux_bind_driver "$bdf" "$driver_name"
		else
			echo "Active mountpoints on /dev/$blkname, so not binding PCI dev $bdf"
		fi
	done

	# IOAT
	for dev_id in $(pci_ids_from_header "PCI_DEVICE_ID_INTEL_IOAT"); do
		for bdf in $(iter_pci_dev_id 8086 "$dev_id"); do
			if pci_is_excluded "$bdf"; then
				echo "Skipping un-whitelisted I/OAT device at $bdf"
				continue
			fi
			linux_bind_driver "$bdf" "$driver_name"
		done
	done

	# virtio
	for dev_id in $(pci_ids_from_header "PCI_DEVICE_ID_VIRTIO"); do
		for bdf in $(iter_pci_dev_id 1af4 "$dev_id"); do
			if pci_is_excluded "$bdf"; then
				echo "Skipping un-whitelisted Virtio device at $bdf"
				continue
			fi
			blknames=''
			get_virtio_names_from_bdf "$bdf" blknames
			for blkname in $blknames; do
				if mount | grep -q "/dev/$blkname"; then
					echo "Active mountpoints on /dev/$blkname, so not binding PCI dev $bdf"
					# Skip to the next PCI device (outer loop).
					continue 2
				fi
			done

			linux_bind_driver "$bdf" "$driver_name"
		done
	done

	echo "1" > "/sys/bus/pci/rescan"
}

# "config" mode on Linux: bind PCI devices, make sure a hugetlbfs mount exists,
# request NRHUGE hugepages (system-wide, or on node HUGENODE when set) and,
# when vfio-pci is in use, hand the hugetlbfs mounts to TARGET_USER and report
# the memlock limit.
# Exits with status 1 when fewer than NRHUGE hugepages could be allocated.
function configure_linux {
	local hugetlbfs_mounts hugepages_target allocated_hugepages
	local mount_dir memlock_amnt memlock_mb

	configure_linux_pci
	hugetlbfs_mounts=$(linux_hugetlbfs_mounts)

	if [ -z "$hugetlbfs_mounts" ]; then
		hugetlbfs_mounts=/mnt/huge
		echo "Mounting hugetlbfs at $hugetlbfs_mounts"
		mkdir -p "$hugetlbfs_mounts"
		mount -t hugetlbfs nodev "$hugetlbfs_mounts"
	fi

	if [ -z "$HUGENODE" ]; then
		hugepages_target="/proc/sys/vm/nr_hugepages"
	else
		hugepages_target="/sys/devices/system/node/node${HUGENODE}/hugepages/hugepages-${HUGEPGSZ}kB/nr_hugepages"
	fi

	echo "$NRHUGE" > "$hugepages_target"
	# Read the value back: the kernel may have allocated fewer pages.
	allocated_hugepages=$(cat "$hugepages_target")
	if [ "$allocated_hugepages" -lt "$NRHUGE" ]; then
		echo ""
		echo "## ERROR: requested $NRHUGE hugepages but only $allocated_hugepages could be allocated."
		echo "## Memory might be heavily fragmented. Please try flushing the system cache, or reboot the machine."
		exit 1
	fi

	if [ "$driver_name" != "vfio-pci" ]; then
		return 0
	fi

	if [ -n "$TARGET_USER" ]; then
		for mount_dir in $hugetlbfs_mounts; do
			chown "$TARGET_USER" "$mount_dir"
			chmod g+w "$mount_dir"
		done
	fi

	memlock_amnt=$(ulimit -l)
	if [ "$memlock_amnt" != "unlimited" ]; then
		memlock_mb=$(( memlock_amnt / 1024 ))
		echo ""
		echo "Current user memlock limit: ${memlock_mb} MB"
		echo ""
		echo "This is the maximum amount of memory you will be"
		echo "able to use with DPDK and VFIO if run as current user."
		echo -n "To change this, please adjust limits.conf memlock "
		echo "limit for current user."

		if [ "$memlock_amnt" -lt 65536 ]; then
			echo ""
			echo "## WARNING: memlock limit is less than 64MB"
			echo -n "## DPDK with VFIO may not be able to initialize "
			echo "if run as current user."
		fi
	fi
}

# Hand whitelisted NVMe, I/OAT and Virtio devices back to their kernel drivers
# (nvme, ioatdma, virtio-pci). NVMe and I/OAT devices are only unbound when
# check_for_driver does not find the respective kernel driver.
function reset_linux_pci {
	local bdf dev_id blkname driver_loaded

	# NVMe
	driver_loaded=0
	check_for_driver nvme || driver_loaded=$?
	for bdf in $(iter_pci_class_code 01 08 02); do
		if pci_is_excluded "$bdf"; then
			# Best-effort lookup, used for the message only.
			blkname=''
			get_nvme_name_from_bdf "$bdf" blkname || true
			echo "Skipping un-whitelisted NVMe controller $blkname ($bdf)"
			continue
		fi
		if [ "$driver_loaded" -ne 0 ]; then
			linux_bind_driver "$bdf" nvme
		else
			linux_unbind_driver "$bdf"
		fi
	done

	# IOAT
	driver_loaded=0
	check_for_driver ioatdma || driver_loaded=$?
	for dev_id in $(pci_ids_from_header "PCI_DEVICE_ID_INTEL_IOAT"); do
		for bdf in $(iter_pci_dev_id 8086 "$dev_id"); do
			if pci_is_excluded "$bdf"; then
				echo "Skipping un-whitelisted I/OAT device at $bdf"
				continue
			fi
			if [ "$driver_loaded" -ne 0 ]; then
				linux_bind_driver "$bdf" ioatdma
			else
				linux_unbind_driver "$bdf"
			fi
		done
	done

	# virtio
	# TODO: check if virtio-pci is loaded first and just unbind if it is not loaded
	# Requires some more investigation - for example, some kernels do not seem to have
	# virtio-pci but just virtio_scsi instead.  Also need to make sure we get the
	# underscore vs. dash right in the virtio_scsi name.
	modprobe virtio-pci || true
	for dev_id in $(pci_ids_from_header "PCI_DEVICE_ID_VIRTIO"); do
		for bdf in $(iter_pci_dev_id 1af4 "$dev_id"); do
			if pci_is_excluded "$bdf"; then
				echo "Skipping un-whitelisted Virtio device at $bdf"
				continue
			fi
			linux_bind_driver "$bdf" virtio-pci
		done
	done

	echo "1" > "/sys/bus/pci/rescan"
}

# "reset" mode on Linux: rebind PCI devices and remove leftover spdk*map_*
# files from the hugetlbfs mounts as well as /run/.spdk* files.
function reset_linux {
	local mount_dir

	reset_linux_pci
	for mount_dir in $(linux_hugetlbfs_mounts); do
		rm -f "$mount_dir"/spdk*map_*
	done
	rm -f /run/.spdk*
}

# "status" mode on Linux: print hugepage usage (per NUMA node when the per-node
# sysfs entries exist, system-wide otherwise) followed by the driver binding
# of every NVMe, I/OAT and Virtio device.
function status_linux {
	local numa_nodes path free_pages all_pages node huge_size
	local bdf dev_id driver name blknames

	echo "Hugepages"
	printf "%-6s %10s %8s / %6s\n" "node" "hugesize" "free" "total"

	numa_nodes=0
	shopt -s nullglob
	# node[0-9]* (rather than node?) also covers node10 and above.
	for path in /sys/devices/system/node/node[0-9]*/hugepages/hugepages-*/; do
		numa_nodes=$((numa_nodes + 1))
		free_pages=$(cat "$path/free_hugepages")
		all_pages=$(cat "$path/nr_hugepages")

		[[ $path =~ (node[0-9]+)/hugepages/hugepages-([0-9]+kB) ]]

		node=${BASH_REMATCH[1]}
		huge_size=${BASH_REMATCH[2]}

		printf "%-6s %10s %8s / %6s\n" "$node" "$huge_size" "$free_pages" "$all_pages"
	done
	shopt -u nullglob

	# fall back to system-wide hugepages
	if [ "$numa_nodes" = "0" ]; then
		free_pages=$(grep HugePages_Free /proc/meminfo | awk '{ print $2 }')
		all_pages=$(grep HugePages_Total /proc/meminfo | awk '{ print $2 }')
		node="-"
		huge_size="$HUGEPGSZ"

		printf "%-6s %10s %8s / %6s\n" "$node" "$huge_size" "$free_pages" "$all_pages"
	fi

	echo "NVMe devices"

	echo -e "BDF\t\tNuma Node\tDriver name\t\tDevice name"
	for bdf in $(iter_pci_class_code 01 08 02); do
		driver=$(grep DRIVER "/sys/bus/pci/devices/$bdf/uevent" | awk -F"=" '{print $2}')
		node=$(cat "/sys/bus/pci/devices/$bdf/numa_node")
		if [[ "$driver" == "nvme" && -d /sys/bus/pci/devices/$bdf/nvme ]]; then
			name="\t$(ls "/sys/bus/pci/devices/$bdf/nvme")"
		else
			name="-"
		fi
		echo -e "$bdf\t$node\t\t$driver\t\t$name"
	done

	echo "I/OAT DMA"

	echo -e "BDF\t\tNuma Node\tDriver Name"
	for dev_id in $(pci_ids_from_header "PCI_DEVICE_ID_INTEL_IOAT"); do
		for bdf in $(iter_pci_dev_id 8086 "$dev_id"); do
			driver=$(grep DRIVER "/sys/bus/pci/devices/$bdf/uevent" | awk -F"=" '{print $2}')
			node=$(cat "/sys/bus/pci/devices/$bdf/numa_node")
			echo -e "$bdf\t$node\t\t$driver"
		done
	done

	echo "virtio"

	echo -e "BDF\t\tNuma Node\tDriver Name\t\tDevice Name"
	for dev_id in $(pci_ids_from_header "PCI_DEVICE_ID_VIRTIO"); do
		for bdf in $(iter_pci_dev_id 1af4 "$dev_id"); do
			driver=$(grep DRIVER "/sys/bus/pci/devices/$bdf/uevent" | awk -F"=" '{print $2}')
			node=$(cat "/sys/bus/pci/devices/$bdf/numa_node")
			blknames=''
			get_virtio_names_from_bdf "$bdf" blknames
			echo -e "$bdf\t$node\t\t$driver\t\t$blknames"
		done
	done
}

# Non-Linux "config", PCI part: build a comma separated list from the
# `pciconf -l` lines matching class 0x010802 (NVMe) or one of the Intel I/OAT
# device IDs, store it in the hw.nic_uio.bdfs kenv variable and (re)load
# nic_uio.ko.
function configure_freebsd_pci {
	local grep_str dev_id bdfs

	# NVMe
	grep_str="class=0x010802"

	# IOAT
	for dev_id in $(pci_ids_from_header "PCI_DEVICE_ID_INTEL_IOAT"); do
		grep_str="${grep_str}\|chip=0x${dev_id}8086"
	done

	# Join fields 2-4 of every matching line (split on ':') with commas.
	bdfs=$(pciconf -l | grep "${grep_str}" \
		| awk -F: '{if (count > 0) printf ","; printf "%s:%s:%s",$2,$3,$4; count++}')

	kldunload nic_uio.ko || true
	kenv hw.nic_uio.bdfs="$bdfs"
	kldload nic_uio.ko
}

# Non-Linux "config": bind PCI devices and load contigmem with HUGEMEM / 256
# buffers of 256 MB each.
function configure_freebsd {
	local num_buffers=$((HUGEMEM / 256))

	configure_freebsd_pci
	# If contigmem is already loaded but the HUGEMEM specified doesn't match the
	# previous value, unload contigmem so that we can reload with the new value.
	if kldstat -q -m contigmem; then
		if [ "$(kenv hw.contigmem.num_buffers)" -ne "$num_buffers" ]; then
			kldunload contigmem.ko
		fi
	fi
	if ! kldstat -q -m contigmem; then
		kenv hw.contigmem.num_buffers="$num_buffers"
		kenv hw.contigmem.buffer_size=$((256 * 1024 * 1024))
		kldload contigmem.ko
	fi
}

# Non-Linux "reset": unload contigmem and nic_uio (best effort).
function reset_freebsd {
	kldunload contigmem.ko || true
	kldunload nic_uio.ko || true
}

mode=${1:-config}

: "${HUGEMEM:=2048}"
: "${PCI_WHITELIST:=""}"

if [ -n "$NVME_WHITELIST" ]; then
	PCI_WHITELIST="$PCI_WHITELIST $NVME_WHITELIST"
fi

if [ -n "$SKIP_PCI" ]; then
	# "none" never matches a PCI address, so nothing gets bound.
	PCI_WHITELIST="none"
fi

# Deliberately unquoted: split the whitespace separated list into an array.
# shellcheck disable=SC2206
declare -a PCI_WHITELIST=(${PCI_WHITELIST})

if [ -z "$TARGET_USER" ]; then
	TARGET_USER="$SUDO_USER"
	if [ -z "$TARGET_USER" ]; then
		TARGET_USER=$(logname 2> /dev/null) || true
	fi
fi

if [ "$(uname)" = Linux ]; then
	# Default hugepage size in kB; evaluates to 0 when /proc/meminfo has no
	# Hugepagesize line.
	HUGEPGSZ=$(( $(grep Hugepagesize /proc/meminfo | cut -d : -f 2 | tr -dc '0-9') ))
	HUGEPGSZ_MB=$(( HUGEPGSZ / 1024 ))

	# An NRHUGE that is set (even to an empty value) overrides HUGEMEM.
	if [ -z "${NRHUGE+x}" ]; then
		if (( HUGEPGSZ > 0 )); then
			# Round up, computed in kB so that hugepage sizes below 1 MB
			# do not cause a division by zero.
			NRHUGE=$(( (HUGEMEM * 1024 + HUGEPGSZ - 1) / HUGEPGSZ ))
		elif [ "$mode" == "config" ]; then
			echo "## ERROR: unable to determine the default hugepage size from /proc/meminfo."
			exit 1
		fi
	fi

	case "$mode" in
		config) configure_linux ;;
		reset) reset_linux ;;
		status) status_linux ;;
		help) usage "$0" ;;
		*) usage "$0" "Invalid argument '$mode'" ;;
	esac
else
	case "$mode" in
		config) configure_freebsd ;;
		reset) reset_freebsd ;;
		help) usage "$0" ;;
		*) usage "$0" "Invalid argument '$mode'" ;;
	esac
fi