#!/usr/bin/env bash
# SPDX-License-Identifier: BSD-3-Clause
# Copyright (C) 2022 Intel Corporation.
# All rights reserved.

curdir=$(readlink -f "$(dirname "$0")")
rootdir=$(readlink -f "$curdir/../../../")

shopt -s nullglob extglob

source "$rootdir/scripts/common.sh"
source "$rootdir/test/scheduler/common.sh"

get_auto_cfg() {
	local vm_cpus vm_node vm vms
	local cpu node nodes_idxs node_idx
	local nvmes nvme nvme_idx nvme_diff nvmes_per_node
	local vm_diff aligned_number_of_vms=0
	local diff iter

	local -g auto_cpu_map=() auto_disk_map=() spdk=()

	map_cpus
	get_nvme_numa_map

	nodes_idxs=("${!nodes[@]}")

	# Construct the initial NUMA-aware setup by pinning each VM to a given nvme's node. The
	# first run pins only as many VMs (up to vm_count) as there are available nvme ctrls.
	vm=0
	for node in "${nodes_idxs[@]}"; do
		nvmes=(${!nvme_numa_map[node]})
		for ((nvme_idx = 0; nvme_idx < ${#nvmes[@]} && vm < vm_count; vm++, nvme_idx++)); do
			eval "vm${vm}_node=$node"
		done
		nvmes_per_node[node]=${#nvmes[@]}
	done

	vm_diff=$((vm_count - vm))

	# Align the number of extra VMs in case nvme ctrls are not distributed evenly across the
	# existing NUMA nodes.
	# FIXME: This is targeted at systems with only 2 NUMA nodes. Technically, the kernel supports
	# more than that - it's possible to achieve setups with > 2 NUMA nodes under a virtual env,
	# for instance. Should this be of any concern?
	if ((nvmes_per_node[0] < nvmes_per_node[1])); then
		nvme_diff=$((nvmes_per_node[1] - nvmes_per_node[0]))
	elif ((nvmes_per_node[0] > nvmes_per_node[1])); then
		nvme_diff=$((nvmes_per_node[0] - nvmes_per_node[1]))
	else
		nvme_diff=0
	fi

	diff=$((vm_diff + nvme_diff))

	if ((diff % 2 == 0)); then
		aligned_number_of_vms=$((diff / ${#nodes_idxs[@]}))
	fi

	# The second run distributes the extra VMs across the existing NUMA nodes. If the number of
	# extra VMs (as per vm_count) is even, simply assign them in bulk. If it's odd, do some
	# simple rr balancing and assign them one by one - first to node0, second to node1, third
	# to node0, and so on.
	if ((aligned_number_of_vms)); then
		for node in "${nodes_idxs[@]}"; do
			for ((iter = 0; iter < aligned_number_of_vms && vm < vm_count; iter++, vm++)); do
				eval "vm${vm}_node=$node"
			done
		done
	else
		while ((vm < vm_count)); do
			for node in "${nodes_idxs[@]}"; do
				eval "vm${vm}_node=$node"
				((++vm))
			done
		done
	fi

	local -g vm_numa_map=()
	for ((vm = 0; vm < vm_count; vm++)); do
		# Load balance the cpus across the available NUMA nodes based on the pinning done
		# prior. If there are no cpus left under the selected node, iterate over all
		# available nodes. If no cpus are left at all, fail. We don't allow mixing cpus
		# from different nodes for the sake of performance.
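		# Illustrative sketch (hypothetical topology): with vm_cpu_num=2 and
		# node_0_cpu=(0 1 2 3), a VM pinned to node0 takes cpus "0,1" for its
		# qemu mask and node_0_cpu shrinks to (2 3) for the next VM on that node.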
		node_idx=0 node_idx_perc=0
		eval "vm_node=\${VM${vm}_NODE:-\$vm${vm}_node}"

		local -n node_cpus=node_${vm_node}_cpu
		local -n vm_nodes=node_${vm_node}_vm

		vm_numa_map[vm_node]="node_${vm_node}_vm[@]"

		while ((${#node_cpus[@]} < vm_cpu_num && node_idx < ${#nodes_idxs[@]})); do
			vm_node=${nodes_idxs[node_idx]}
			local -n node_cpus=node_${nodes_idxs[node_idx++]}_cpu
		done

		if ((${#node_cpus[@]} < vm_cpu_num)); then
			printf 'Not enough CPUs available for VM %u (CPUs: %u, Nodes: %u, CPUs per VM: %u)\n' \
				"$vm" "${#cpus[@]}" "${#nodes_idxs[@]}" "$vm_cpu_num" >&2
			return 1
		fi

		# Normalize indexes
		node_cpus=("${node_cpus[@]}")

		vm_cpus=("${node_cpus[@]::vm_cpu_num}")
		node_cpus=("${node_cpus[@]:vm_cpu_num}")

		auto_cpu_map+=("$(
			cat <<- CPU_VM
				VM_${vm}_qemu_mask=$(
					IFS=","
					echo "${vm_cpus[*]}"
				)
				VM_${vm}_qemu_numa_node=$vm_node
			CPU_VM
		)")

		# Save the VM->NUMA node mapping to be able to construct the disk map in later steps.
		vm_nodes+=("$vm")
	done

	# auto_cpu_map is ready and all requested VMs should be balanced across all NUMA nodes,
	# making sure each nvme drive will be bound to at least 1 VM placed on the
	# corresponding NUMA node. Now, construct the disk config and assign VMs, with a proper
	# split value, to each nvme - extra VMs will be added to nvme drives in their
	# bus order.
	local -A nvme_vm_map=()
	local iter nvmes_no=0 vms_no=0 _vms_per_nvme
	for node in "${nodes_idxs[@]}"; do
		if [[ ! -v nvme_numa_map[node] ]]; then
			# There are no drives available on that node, skip it
			continue
		fi
		nvmes=(${!nvme_numa_map[node]}) nvmes_no=${#nvmes[@]}
		vms=(${!vm_numa_map[node]}) vms_no=${#vms[@]}
		for ((iter = 0; iter <= (vms_no - nvmes_no <= 0 ? 1 : vms_no - nvmes_no); iter++)); do
			for nvme in "${nvmes[@]}"; do
				_vms_per_nvme=0
				if ((${#vms[@]} == 0)); then
					# No VMs left on the given node or they have been exhausted - skip the remaining drives.
					continue 3
				fi
				nvme_vm_map["$nvme"]="_${nvme//[:.]/_}_[@]"
				local -n nvme_vms=_${nvme//[:.]/_}_
				while ((++_vms_per_nvme <= vms_per_nvme)); do
					nvme_vms+=("${vms[0]}") vms=("${vms[@]:1}")
				done
			done
		done
	done

	local sorted_nvmes=()
	sorted_nvmes=($(printf '%s\n' "${!nvme_vm_map[@]}" | sort))
	for nvme in "${!sorted_nvmes[@]}"; do
		vms=(${!nvme_vm_map["${sorted_nvmes[nvme]}"]})
		auto_disk_map+=("${sorted_nvmes[nvme]},Nvme$((nvme++)),${#vms[*]},${vms[*]}")
	done

	get_spdk_cpus || return 1

	auto_cpu_map+=("vhost_0_reactor_mask=[$(
		IFS=","
		echo "${spdk[*]}"
	)]")
	auto_cpu_map+=("vhost_0_main_core=${spdk[0]}")
}

get_nvme_numa_map() {
	local nvmes nvme node
	local -g nvme_numa_map=()

	cache_pci_bus

	for nvme in ${pci_bus_cache[0x010802]}; do
		node=$(< "/sys/bus/pci/devices/$nvme/numa_node")
		nvme_numa_map[node]="node_${node}_nvme[@]"
		local -n node_nvmes=node_${node}_nvme
		node_nvmes+=("$nvme")
	done
}

get_spdk_cpus() {
	local -g spdk=()
	local node vms perc
	local cpus_per_node cpus_exhausted=() cpus_remained=()
	local node_sort=() _node

	if [[ -z $spdk_cpu_num ]]; then
		spdk=(0)
		return 0
	fi

	if [[ -n $spdk_cpu_list ]]; then
		spdk=($(parse_cpu_list <(echo "$spdk_cpu_list")))
		return 0
	fi

	# Start allocating from the NUMA node with the greater number of pinned VMs.
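	# Illustrative sketch (hypothetical counts): with 3 VMs pinned to node0 and 1 to node1,
	# node_sort becomes ("3:0" "1:1") - each entry is "<VM count>:<node>", sorted in
	# descending order so the most loaded node is handled first.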
	node_sort=($(for node in "${!vm_numa_map[@]}"; do
		vms=(${!vm_numa_map[node]})
		echo "${#vms[@]}:$node"
	done | sort -rn))

	for _node in "${node_sort[@]}"; do
		node=${_node#*:} vms=${_node%:*}
		local -n node_all_cpus=node_${node}_cpu
		perc=$((vms * 100 / vm_count))
		cpus_per_node=$((spdk_cpu_num * perc / 100))
		cpus_per_node=$((cpus_per_node == 0 ? 1 : cpus_per_node))

		if ((${#node_all_cpus[@]} == 0)); then
			printf 'No CPUs left to allocate for SPDK on node%u. Need %u CPUs\n' \
				"$node" "$cpus_per_node" >&2

			cpus_exhausted[node]=1
			continue
		fi
		if ((${#node_all_cpus[@]} < cpus_per_node)); then
			printf 'Not enough CPUs to allocate for SPDK on node%u. Need %u CPUs, getting %u\n' \
				"$node" "$cpus_per_node" "${#node_all_cpus[@]}" >&2
			cpus_per_node=${#node_all_cpus[@]}
			cpus_exhausted[node]=1
		fi

		spdk+=("${node_all_cpus[@]::cpus_per_node}")
		node_all_cpus=("${node_all_cpus[@]:cpus_per_node}")
		cpus_remained+=("${node_all_cpus[@]}")
	done

	# If we didn't allocate the entire number of requested cpus in the initial run,
	# adjust it by adding the remaining portion from the node with the greater number
	# of pinned VMs.
	if ((${#spdk[@]} < spdk_cpu_num)); then
		if [[ -n $ALIGN_FROM_ALL_NODES ]] && ((${#cpus_remained[@]} > 0)); then
			printf 'Trying to get extra CPUs from all nodes\n'
			local -n node_all_cpus=cpus_remained
		else
			node=${node_sort[0]#*:}
			printf 'Trying to get extra CPUs from the dominant node%u to align: %u < %u\n' \
				"$node" "${#spdk[@]}" "$spdk_cpu_num"
			if ((cpus_exhausted[node])); then
				printf 'No CPUs available on node%u\n' "$node"
			else
				local -n node_all_cpus=node_${node}_cpu
			fi
		fi
		spdk+=("${node_all_cpus[@]::spdk_cpu_num-${#spdk[@]}}")
	fi >&2
	if ((${#spdk[@]} != spdk_cpu_num)); then
		printf 'Allocated a different number of SPDK CPUs than requested: requested %u, got %u\n' \
			"$spdk_cpu_num" "${#spdk[@]}"
	else
		printf 'Requested number of SPDK CPUs allocated: %u\n' "$spdk_cpu_num"
	fi >&2
}

_p_disk_map() {
	((${#auto_disk_map[@]} > 0)) || return 0
	printf '%s\n' "${auto_disk_map[@]}"
}

_p_cpu_map() {
	((${#auto_cpu_map[@]} > 0)) || return 0
	printf '%s\n' "${auto_cpu_map[@]}"
}

p_disk_map() {
	cat <<- DISK_MAP
		# Generated automatically by ${0##*/}
		# NVMe Drives: ${#auto_disk_map[@]} VM count: $vm_count
		$(_p_disk_map)
	DISK_MAP
}

p_vms_in_node() {
	((${#vm_numa_map[@]} > 0)) || return 0

	local node vms
	for node in "${!vm_numa_map[@]}"; do
		vms=(${!vm_numa_map[node]})
		echo "Node$node: ${#vms[@]} VMs"
	done
}

p_cpu_map() {
	local node_stats

	mapfile -t node_stats < <(p_vms_in_node)
	cat <<- CPU_MAP
		# Generated automatically by ${0##*/}
		# VM NUMA Nodes: ${#vm_numa_map[@]} VM count: $vm_count CPU Per VM: $vm_cpu_num SPDK CPU count: ${#spdk[@]}
		$(printf '# - %s\n' "${node_stats[@]}")
		$(_p_cpu_map)
	CPU_MAP
}

p_all() {
	p_disk_map
	printf '\n'
	p_cpu_map
}

fetch_env() {
	spdk_cpu_num=${spdk_cpu_num:-1}
	vm_count=${vm_count:-1}
	vm_cpu_num=${vm_cpu_num:-1}
	vms_per_nvme=${vms_per_nvme:-1}

	# Normalize
	spdk_cpu_num=$((spdk_cpu_num <= 0 ? 1 : spdk_cpu_num))
	vm_count=$((vm_count <= 0 ? 1 : vm_count))
	vm_cpu_num=$((vm_cpu_num <= 0 ? 1 : vm_cpu_num))
	vms_per_nvme=$((vms_per_nvme <= 0 ? 1 : vms_per_nvme))

	cpu_out=${cpu_out:-"$PWD/auto-cpu.conf"}
	disk_out=${disk_out:-"$PWD/auto-disk.conf"}
}
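
# For reference, a generated pair of configs may look roughly like this (illustrative
# values only; actual PCI addresses, masks and counts depend on the topology):
#
#   auto-disk.conf:   0000:1a:00.0,Nvme0,1,0
#   auto-cpu.conf:    VM_0_qemu_mask=0,1
#                     VM_0_qemu_numa_node=0
#                     vhost_0_reactor_mask=[2]
#                     vhost_0_main_core=2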

help() {
	cat <<- HELP
		${0##*/}: [-p all|cpu|disk -s]

		Configuration is generated based on the system's cpu and nvme topology. Parameters
		are taken directly from the environment:

		spdk_cpu_list - list of CPUs to assign to the SPDK app
		spdk_cpu_num - number of CPUs to use across all NUMA nodes
		               (spdk_cpu_list takes priority, default: 1)
		vm_count - number of VMs to prepare the configuration for
		           (default: 1)
		vm_cpu_num - number of CPUs to assign per VM (default: 1)
		vms_per_nvme - number of VMs to pin to a single nvme (default: 1)

		Override parameters:
		VM[N]_NODE - overrides the selected NUMA node for VM N - by default,
		             VMs are allocated up to the number of nvme drives
		cpu_out - with -s, points at the location where the cpu conf is saved
		disk_out - with -s, points at the location where the disk conf is saved

		Note: VMs are pinned to nvme drives based on their NUMA location.

		Example:
		# Allocate 6 CPUs for SPDK. Configure 24 VMs, 2 CPUs per VM
		$ export spdk_cpu_num=6 vm_count=24 vm_cpu_num=2
		$ ${0##*/} -p all
	HELP
}

print=""
save=no

fetch_env

while getopts :hsp: arg; do
	case "$arg" in
		h)
			help
			exit 0
			;;
		p) print=$OPTARG ;;
		s) save=yes ;;
		*) ;;
	esac
done

get_auto_cfg || exit 1

case "$print" in
	all) p_all ;;
	cpu) p_cpu_map ;;
	disk) p_disk_map ;;
	*) ;;
esac

if [[ $save == yes ]]; then
	p_cpu_map > "$cpu_out"
	p_disk_map > "$disk_out"
fi