#!/usr/bin/env bash
# SPDX-License-Identifier: BSD-3-Clause
# Copyright (C) 2022 Intel Corporation.
# All rights reserved.

curdir=$(readlink -f "$(dirname "$0")")
rootdir=$(readlink -f "$curdir/../../../")

source "$rootdir/scripts/common.sh"
source "$rootdir/test/scheduler/common.sh"

get_auto_cfg() {
	local vm_cpus vm_node vm vms
	local cpu node nodes_idxs node_idx
	local nvmes nvme nvme_idx nvme_diff nvmes_per_node
	local vm_diff aligned_number_of_vms=0
	local diff iter

	local -g auto_cpu_map=() auto_disk_map=() spdk=()

	map_cpus
	get_nvme_numa_map

	nodes_idxs=("${!nodes[@]}")

	# Construct the initial NUMA-aware setup by pinning each VM to its nvme's node. The first
	# pass pins just enough VMs (as per vm_count) to match the number of available nvme ctrls.
	vm=0
	for node in "${nodes_idxs[@]}"; do
		nvmes=(${!nvme_numa_map[node]})
		for ((nvme_idx = 0; nvme_idx < ${#nvmes[@]} && vm < vm_count; vm++, nvme_idx++)); do
			eval "vm${vm}_node=$node"
		done
		nvmes_per_node[node]=${#nvmes[@]}
	done

	vm_diff=$((vm_count - vm))

	# Align the number of extra VMs in case the nvme ctrls are not distributed evenly across
	# the existing NUMA nodes.
	# FIXME: This targets systems with only 2 NUMA nodes. Technically, the kernel supports
	# more than that - e.g. setups with > 2 NUMA nodes are achievable under a virtual env.
	# Should this be of any concern?
	if ((nvmes_per_node[0] < nvmes_per_node[1])); then
		nvme_diff=$((nvmes_per_node[1] - nvmes_per_node[0]))
	elif ((nvmes_per_node[0] > nvmes_per_node[1])); then
		nvme_diff=$((nvmes_per_node[0] - nvmes_per_node[1]))
	else
		nvme_diff=0
	fi

	diff=$((vm_diff + nvme_diff))

	if ((diff % 2 == 0)); then
		aligned_number_of_vms=$((diff / ${#nodes_idxs[@]}))
	fi

	# The second pass distributes the extra VMs across the existing NUMA nodes. If an even
	# number of extra VMs (as per vm_count) can be distributed, simply assign them in bulk.
	# If the number is odd, do simple round-robin balancing, assigning them one by one -
	# first to node0, second to node1, third to node0, and so on.
	if ((aligned_number_of_vms)); then
		for node in "${nodes_idxs[@]}"; do
			for ((iter = 0; iter < aligned_number_of_vms && vm < vm_count; iter++, vm++)); do
				eval "vm${vm}_node=$node"
			done
		done
	else
		while ((vm < vm_count)); do
			for node in "${nodes_idxs[@]}"; do
				eval "vm${vm}_node=$node"
				((++vm))
			done
		done
	fi

	local -g vm_numa_map=()
	for ((vm = 0; vm < vm_count; vm++)); do
		# Load-balance the CPUs across the available NUMA nodes based on the pinning done
		# above. If there are no CPUs left under the selected node, iterate over all
		# available nodes. If no CPUs are left at all, fail. We don't allow mixing CPUs
		# from different nodes for the sake of performance.
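		#
		# Illustrative example (hypothetical topology): with 2 NUMA nodes, 4 VMs and
		# vm_cpu_num=2, the two VMs pinned to node0 take CPUs from node_0_cpu two at a
		# time (e.g. 0,1 and 2,3), and the two VMs pinned to node1 take CPUs from
		# node_1_cpu (e.g. 8,9 and 10,11). Actual CPU ids depend on what map_cpus reports.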
		node_idx=0 node_idx_perc=0
		eval "vm_node=\$vm${vm}_node"

		local -n node_cpus=node_${vm_node}_cpu
		local -n vm_nodes=node_${vm_node}_vm

		vm_numa_map[vm_node]="node_${vm_node}_vm[@]"

		while ((${#node_cpus[@]} < vm_cpu_num && node_idx < ${#nodes_idxs[@]})); do
			vm_node=${nodes_idxs[node_idx]}
			local -n node_cpus=node_${nodes_idxs[node_idx++]}_cpu
		done

		if ((${#node_cpus[@]} < vm_cpu_num)); then
			printf 'Not enough CPUs available for VM %u (CPUs: %u, Nodes: %u, CPUs per VM: %u)\n' \
				"$vm" "${#cpus[@]}" "${#nodes_idxs[@]}" "$vm_cpu_num" >&2
			return 1
		fi

		# Normalize indexes
		node_cpus=("${node_cpus[@]}")

		vm_cpus=("${node_cpus[@]::vm_cpu_num}")
		node_cpus=("${node_cpus[@]:vm_cpu_num}")

		auto_cpu_map+=("$(
			cat <<- CPU_VM
				VM_${vm}_qemu_mask=$(
					IFS=","
					echo "${vm_cpus[*]}"
				)
				VM_${vm}_qemu_numa_node=$vm_node
			CPU_VM
		)")

		# Save the VM->NUMA node mapping to be able to construct the disk map in later steps.
		vm_nodes+=("$vm")
	done

	# auto_cpu_map is ready - all requested VMs should now be balanced across the NUMA nodes,
	# making sure each nvme drive is bound to at least 1 VM placed on the corresponding NUMA
	# node. Now construct the disk config and assign VMs, with a proper split value, to each
	# nvme - extra VMs are added to nvme drives in their bus order.
	local -A nvme_vm_map=()
	local iter nvmes_no=0 vms_no=0 _vms_per_nvme
	for node in "${nodes_idxs[@]}"; do
		if [[ ! -v nvme_numa_map[node] ]]; then
			# There are no drives available on that node, skip it
			continue
		fi
		nvmes=(${!nvme_numa_map[node]}) nvmes_no=${#nvmes[@]}
		vms=(${!vm_numa_map[node]}) vms_no=${#vms[@]}
		for ((iter = 0; iter <= (vms_no - nvmes_no <= 0 ? 1 : vms_no - nvmes_no); iter++)); do
			for nvme in "${nvmes[@]}"; do
				_vms_per_nvme=0
				if ((${#vms[@]} == 0)); then
					# No VMs on the given node or they have been exhausted - skip all remaining drives.
					continue 3
				fi
				nvme_vm_map["$nvme"]="_${nvme//[:.]/_}_[@]"
				local -n nvme_vms=_${nvme//[:.]/_}_
				while ((++_vms_per_nvme <= vms_per_nvme)); do
					nvme_vms+=("${vms[0]}") vms=("${vms[@]:1}")
				done
			done
		done
	done

	local sorted_nvmes=()
	sorted_nvmes=($(printf '%s\n' "${!nvme_vm_map[@]}" | sort))
	for nvme in "${!sorted_nvmes[@]}"; do
		vms=(${!nvme_vm_map["${sorted_nvmes[nvme]}"]})
		auto_disk_map+=("${sorted_nvmes[nvme]},Nvme$((nvme++)),${#vms[*]},${vms[*]}")
	done

	get_spdk_cpus || return 1

	auto_cpu_map+=("vhost_0_reactor_mask=[$(
		IFS=","
		echo "${spdk[*]}"
	)]")
	auto_cpu_map+=("vhost_0_master_core=${spdk[0]}")
}

get_nvme_numa_map() {
	local nvmes nvme node
	local -g nvme_numa_map=()

	cache_pci_bus

	for nvme in ${pci_bus_cache[0x010802]}; do
		node=$(< "/sys/bus/pci/devices/$nvme/numa_node")
		nvme_numa_map[node]="node_${node}_nvme[@]"
		local -n node_nvmes=node_${node}_nvme
		node_nvmes+=("$nvme")
	done
}

get_spdk_cpus() {
	local -g spdk=()
	local node vms perc
	local cpus_per_node cpus_exhausted=() cpus_remained=()
	local _node node_sort=()

	if [[ -z $spdk_cpu_num ]]; then
		spdk=(0)
		return 0
	fi

	if [[ -n $spdk_cpu_list ]]; then
		spdk=($(parse_cpu_list <(echo "$spdk_cpu_list")))
		return 0
	fi

	# Start allocating from the NUMA node with the greatest number of pinned VMs.
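	# Illustrative example (hypothetical counts): with 3 VMs pinned to node0 and 1 VM
	# pinned to node1, the loop below yields node_sort=(3:0 1:1), so node0 is processed
	# first and is asked for ~75% of the requested SPDK CPUs.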
	node_sort=($(for node in "${!vm_numa_map[@]}"; do
		vms=(${!vm_numa_map[node]})
		echo "${#vms[@]}:$node"
	done | sort -rn))

	for _node in "${node_sort[@]}"; do
		node=${_node#*:} vms=${_node%:*}
		local -n node_all_cpus=node_${node}_cpu
		perc=$((vms * 100 / vm_count))
		cpus_per_node=$((spdk_cpu_num * perc / 100))
		cpus_per_node=$((cpus_per_node == 0 ? 1 : cpus_per_node))

		if ((${#node_all_cpus[@]} == 0)); then
			printf 'No CPUs left to allocate for SPDK on node%u. Need %u CPUs\n' \
				"$node" "$cpus_per_node" >&2

			cpus_exhausted[node]=1
			continue
		fi
		if ((${#node_all_cpus[@]} < cpus_per_node)); then
			printf 'Not enough CPUs to allocate for SPDK on node%u. Need %u CPUs, getting %u\n' \
				"$node" "$cpus_per_node" "${#node_all_cpus[@]}" >&2
			cpus_per_node=${#node_all_cpus[@]}
			cpus_exhausted[node]=1
		fi

		spdk+=("${node_all_cpus[@]::cpus_per_node}")
		node_all_cpus=("${node_all_cpus[@]:cpus_per_node}")
		cpus_remained+=("${node_all_cpus[@]}")
	done

	# If we didn't allocate the entire number of requested CPUs in the initial run, make up
	# the difference with CPUs remaining on the node with the greatest number of pinned VMs
	# (or on all nodes if ALIGN_FROM_ALL_NODES is set).
	if ((${#spdk[@]} < spdk_cpu_num)); then
		if [[ -n $ALIGN_FROM_ALL_NODES ]] && ((${#cpus_remained[@]} > 0)); then
			printf 'Trying to get extra CPUs from all nodes\n'
			local -n node_all_cpus=cpus_remained
		else
			node=${node_sort[0]#*:}
			printf 'Trying to get extra CPUs from the dominant node%u to align: %u < %u\n' \
				"$node" "${#spdk[@]}" "$spdk_cpu_num"
			if ((cpus_exhausted[node])); then
				printf 'No CPUs available on node%u\n' "$node"
			else
				local -n node_all_cpus=node_${node}_cpu
			fi
		fi
		spdk+=("${node_all_cpus[@]::spdk_cpu_num-${#spdk[@]}}")
	fi >&2
	if ((${#spdk[@]} != spdk_cpu_num)); then
		printf 'Could not allocate the requested number of SPDK CPUs: requested %u, got %u\n' \
			"$spdk_cpu_num" "${#spdk[@]}"
	else
		printf 'Requested number of SPDK CPUs allocated: %u\n' "$spdk_cpu_num"
	fi >&2
}

_p_disk_map() {
	((${#auto_disk_map[@]} > 0)) || return 0
	printf '%s\n' "${auto_disk_map[@]}"
}

_p_cpu_map() {
	((${#auto_cpu_map[@]} > 0)) || return 0
	printf '%s\n' "${auto_cpu_map[@]}"
}

p_disk_map() {
	cat <<- DISK_MAP
		# Generated automatically by ${0##*/}
		# NVMe Drives: ${#auto_disk_map[@]} VM count: $vm_count
		$(_p_disk_map)
	DISK_MAP
}

p_vms_in_node() {
	((${#vm_numa_map[@]} > 0)) || return 0

	local node vms
	for node in "${!vm_numa_map[@]}"; do
		vms=(${!vm_numa_map[node]})
		echo "Node$node: ${#vms[@]} VMs"
	done
}

p_cpu_map() {
	local node_stats

	mapfile -t node_stats < <(p_vms_in_node)
	cat <<- CPU_MAP
		# Generated automatically by ${0##*/}
		# VM NUMA Nodes: ${#vm_numa_map[@]} VM count: $vm_count CPU Per VM: $vm_cpu_num SPDK CPU count: ${#spdk[@]}
		$(printf '# - %s\n' "${node_stats[@]}")
		$(_p_cpu_map)
	CPU_MAP
}

p_all() {
	p_disk_map
	printf '\n'
	p_cpu_map
}
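
# Illustrative p_all output, assuming a hypothetical single-node system with one nvme
# at 0000:5e:00.0 and vm_count=2, vm_cpu_num=2, spdk_cpu_num=1, vms_per_nvme=1:
#
#   # Generated automatically by <script>
#   # NVMe Drives: 1 VM count: 2
#   0000:5e:00.0,Nvme0,2,0 1
#
#   # Generated automatically by <script>
#   # VM NUMA Nodes: 1 VM count: 2 CPU Per VM: 2 SPDK CPU count: 1
#   # - Node0: 2 VMs
#   VM_0_qemu_mask=0,1
#   VM_0_qemu_numa_node=0
#   VM_1_qemu_mask=2,3
#   VM_1_qemu_numa_node=0
#   vhost_0_reactor_mask=[4]
#   vhost_0_master_core=4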

fetch_env() {
	spdk_cpu_num=${spdk_cpu_num:-1}
	vm_count=${vm_count:-1}
	vm_cpu_num=${vm_cpu_num:-1}
	vms_per_nvme=${vms_per_nvme:-1}

	# Normalize
	spdk_cpu_num=$((spdk_cpu_num <= 0 ? 1 : spdk_cpu_num))
	vm_count=$((vm_count <= 0 ? 1 : vm_count))
	vm_cpu_num=$((vm_cpu_num <= 0 ? 1 : vm_cpu_num))
	vms_per_nvme=$((vms_per_nvme <= 0 ? 1 : vms_per_nvme))

	cpu_out=${cpu_out:-"$PWD/auto-cpu.conf"}
	disk_out=${disk_out:-"$PWD/auto-disk.conf"}
}

help() {
	cat <<- HELP
		${0##*/}: [-p all|cpu|disk -s]

		Configuration is generated based on the system's cpu and nvme topology. Parameters
		are taken directly from the environment:

		spdk_cpu_list - list of CPUs to assign to the SPDK app
		spdk_cpu_num  - number of CPUs to use across all NUMA nodes
		                (spdk_cpu_list takes priority, default: 1)
		vm_count      - number of VMs to prepare the configuration for
		                (default: 1)
		vm_cpu_num    - number of CPUs to assign per VM (default: 1)
		vms_per_nvme  - number of VMs to pin to a single nvme (default: 1)

		Override parameters:
		vmN_node - overrides the selected NUMA node for VM N - by default, nodes
		           are assigned to VMs up to the number of available nvme drives
		cpu_out  - with -s, path under which the cpu conf is saved
		disk_out - with -s, path under which the disk conf is saved

		Note: VMs are pinned to nvme drives based on their NUMA location.

		Example:
		# Allocate 6 CPUs for SPDK. Configure 24 VMs, 2 CPUs per VM
		$ export spdk_cpu_num=6 vm_count=24 vm_cpu_num=2
		$ ${0##*/} -p all
	HELP
}

print=""
save=no

fetch_env

while getopts :hsp: arg; do
	case "$arg" in
		h)
			help
			exit 0
			;;
		p) print=$OPTARG ;;
		s) save=yes ;;
		*) ;;
	esac
done

get_auto_cfg || exit 1

case "$print" in
	all) p_all ;;
	cpu) p_cpu_map ;;
	disk) p_disk_map ;;
	*) ;;
esac

if [[ $save == yes ]]; then
	p_cpu_map > "$cpu_out"
	p_disk_map > "$disk_out"
fi
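
# Usage note (illustrative): all knobs come from the environment, so a typical
# non-interactive run could look like:
#   vm_count=12 vm_cpu_num=2 spdk_cpu_num=4 ./<this script> -p all -s
# which prints both maps to stdout and saves them to $cpu_out and $disk_out.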