#!/usr/bin/env bash
#  SPDX-License-Identifier: BSD-3-Clause
#  Copyright (C) 2022 Intel Corporation.
#  All rights reserved.

curdir=$(readlink -f "$(dirname "$0")")
rootdir=$(readlink -f "$curdir/../../../")

shopt -s nullglob extglob

source "$rootdir/scripts/common.sh"
source "$rootdir/test/scheduler/common.sh"

get_auto_cfg() {
	local vm_cpus vm_node vm vms
	local cpu node nodes_idxs node_idx
	local nvmes nvme nvme_idx nvme_diff nvmes_per_node
	local vm_diff aligned_number_of_vms=0
	local diff iter

	local -g auto_cpu_map=() auto_disk_map=() spdk=()

	map_cpus
	get_nvme_numa_map

	nodes_idxs=("${!nodes[@]}")

	# Construct the initial NUMA-aware setup by pinning each VM to its nvme's node. The first
	# pass pins only as many VMs (up to vm_count) as there are available nvme ctrls.
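	# Illustrative example (assuming 2 nvme ctrls on node0, 1 on node1 and vm_count=4):
	# this pass sets vm0_node=0, vm1_node=0 and vm2_node=1, leaving vm3 for the balancing
	# passes below.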
	vm=0
	for node in "${nodes_idxs[@]}"; do
		nvmes=(${!nvme_numa_map[node]})
		for ((nvme_idx = 0; nvme_idx < ${#nvmes[@]} && vm < vm_count; vm++, nvme_idx++)); do
			eval "vm${vm}_node=$node"
		done
		nvmes_per_node[node]=${#nvmes[@]}
	done

	vm_diff=$((vm_count - vm))

	# Balance the extra VMs in case the nvme ctrls are not distributed evenly across the
	# existing NUMA nodes.
	# FIXME: This targets systems with only 2 NUMA nodes. Technically, the kernel supports
	# more than that - it's possible to achieve setups with > 2 NUMA nodes under a virtual
	# env for instance. Should this be of any concern?
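	# Continuing the example above: vm_diff=4-3=1 and nvme_diff=2-1=1, so diff=2. Since diff
	# is even, aligned_number_of_vms becomes 2/2=1 extra VM per node.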
	if ((nvmes_per_node[0] < nvmes_per_node[1])); then
		nvme_diff=$((nvmes_per_node[1] - nvmes_per_node[0]))
	elif ((nvmes_per_node[0] > nvmes_per_node[1])); then
		nvme_diff=$((nvmes_per_node[0] - nvmes_per_node[1]))
	else
		nvme_diff=0
	fi

	diff=$((vm_diff + nvme_diff))

	if ((diff % 2 == 0)); then
		aligned_number_of_vms=$((diff / ${#nodes_idxs[@]}))
	fi

	# The second pass distributes the extra VMs across the existing NUMA nodes. If the number
	# of extra VMs (as per vm_count) divides evenly, simply assign them in bulk. If there's an
	# odd number, do simple rr balancing and assign them one by one - first to node0, second
	# to node1, third to node0, etc.
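	# In the running example aligned_number_of_vms=1, so the bulk branch assigns the one
	# remaining VM (vm3) to node0; node1 gets no extra VM since vm_count is already reached.
	# With an odd diff the rr branch below would alternate node0, node1, node0, ...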
	if ((aligned_number_of_vms)); then
		for node in "${nodes_idxs[@]}"; do
			for ((iter = 0; iter < aligned_number_of_vms && vm < vm_count; iter++, vm++)); do
				eval "vm${vm}_node=$node"
			done
		done
	else
		while ((vm < vm_count)); do
			for node in "${nodes_idxs[@]}"; do
				eval "vm${vm}_node=$node"
				((++vm))
			done
		done
	fi

	local -g vm_numa_map=()
	for ((vm = 0; vm < vm_count; vm++)); do
		# Load balance the cpus across the available numa nodes based on the pinning
		# done above. If there are no cpus left on the selected node, iterate over
		# all available nodes. If no cpus are left at all, fail. For performance
		# reasons, cpus from different nodes are never mixed within a single VM.
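		# Illustrative example (assuming vm_cpu_num=2 and node_0_cpu=(0 1 2 3)): vm0 pinned
		# to node0 gets cpus 0,1 and node_0_cpu shrinks to (2 3) for the next VM.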
		node_idx=0 node_idx_perc=0
		eval "vm_node=\${VM${vm}_NODE:-\$vm${vm}_node}"

		local -n node_cpus=node_${vm_node}_cpu
		local -n vm_nodes=node_${vm_node}_vm

		vm_numa_map[vm_node]="node_${vm_node}_vm[@]"

		while ((${#node_cpus[@]} < vm_cpu_num && node_idx < ${#nodes_idxs[@]})); do
			vm_node=${nodes_idxs[node_idx]}
			local -n node_cpus=node_${nodes_idxs[node_idx++]}_cpu
		done

		if ((${#node_cpus[@]} < vm_cpu_num)); then
			printf 'Not enough CPUs available for VM %u (CPUs: %u, Nodes: %u, CPUs per VM: %u)\n' \
				"$vm" "${#cpus[@]}" "${#nodes_idxs[@]}" "$vm_cpu_num" >&2
			return 1
		fi

		# Normalize indexes
		node_cpus=("${node_cpus[@]}")

		vm_cpus=("${node_cpus[@]::vm_cpu_num}")
		node_cpus=("${node_cpus[@]:vm_cpu_num}")

		auto_cpu_map+=("$(
			cat <<- CPU_VM
				VM_${vm}_qemu_mask=$(
					IFS=","
					echo "${vm_cpus[*]}"
				)
				VM_${vm}_qemu_numa_node=$vm_node
			CPU_VM
		)")
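		# Each entry appended above expands to something like (illustrative values):
		#   VM_0_qemu_mask=0,1
		#   VM_0_qemu_numa_node=0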

		# Save map of each VM->NUMA node to be able to construct a disk map in later steps.
		vm_nodes+=("$vm")
	done

	# auto_cpu_map is ready and all requested VMs should be balanced across the NUMA nodes,
	# making sure each nvme drive will be bound to at least 1 VM placed on the corresponding
	# NUMA node. Now construct the disk config and assign VMs, with a proper split value, to
	# each nvme - extra VMs are added to the nvme drives in their bus order.
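	# Each resulting auto_disk_map entry has the form "pci_addr,NvmeN,vm_count,vm_list", e.g.
	# (with an illustrative PCI address) "0000:01:00.0,Nvme0,2,0 1" - the nvme at 0000:01:00.0
	# is exposed as Nvme0 and split between VM 0 and VM 1.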
	local -A nvme_vm_map=()
	local iter nvmes_no=0 vms_no=0 _vms_per_nvme
	for node in "${nodes_idxs[@]}"; do
		if [[ ! -v nvme_numa_map[node] ]]; then
			# There are no drives available on that node, skip it
			continue
		fi
		nvmes=(${!nvme_numa_map[node]}) nvmes_no=${#nvmes[@]}
		vms=(${!vm_numa_map[node]}) vms_no=${#vms[@]}
		for ((iter = 0; iter <= (vms_no - nvmes_no <= 0 ? 1 : vms_no - nvmes_no); iter++)); do
			for nvme in "${nvmes[@]}"; do
				_vms_per_nvme=0
				if ((${#vms[@]} == 0)); then
					# No VMs left on the given node, or they have been exhausted - skip all remaining drives.
					continue 3
				fi
				nvme_vm_map["$nvme"]="_${nvme//[:.]/_}_[@]"
				local -n nvme_vms=_${nvme//[:.]/_}_
				while ((++_vms_per_nvme <= vms_per_nvme)); do
					nvme_vms+=("${vms[0]}") vms=("${vms[@]:1}")
				done
			done
		done
	done

	local sorted_nvmes=()
	sorted_nvmes=($(printf '%s\n' "${!nvme_vm_map[@]}" | sort))
	for nvme in "${!sorted_nvmes[@]}"; do
		vms=(${!nvme_vm_map["${sorted_nvmes[nvme]}"]})
		auto_disk_map+=("${sorted_nvmes[nvme]},Nvme$((nvme++)),${#vms[*]},${vms[*]}")
	done

	get_spdk_cpus || return 1

	auto_cpu_map+=("vhost_0_reactor_mask=[$(
		IFS=","
		echo "${spdk[*]}"
	)]")
	auto_cpu_map+=("vhost_0_main_core=${spdk[0]}")
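	# Illustrative example of the two entries above: vhost_0_reactor_mask=[0,1,28,29] and
	# vhost_0_main_core=0.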
}

get_nvme_numa_map() {
	local nvmes nvme node
	local -g nvme_numa_map=()
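	# nvme_numa_map[node] stores an indirect reference ("node_<node>_nvme[@]") to the list
	# of nvme PCI addresses located on that node, e.g. (illustrative addresses)
	# nvme_numa_map[0]="node_0_nvme[@]" with node_0_nvme=(0000:01:00.0 0000:02:00.0).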

	cache_pci_bus

	for nvme in ${pci_bus_cache[0x010802]}; do
		node=$(< "/sys/bus/pci/devices/$nvme/numa_node")
		nvme_numa_map[node]="node_${node}_nvme[@]"
		local -n node_nvmes=node_${node}_nvme
		node_nvmes+=("$nvme")
	done
}

get_spdk_cpus() {
	local -g spdk=()
	local node vms perc
	local cpus_per_node cpus_exhausted=() cpus_remained=()

	if [[ -z $spdk_cpu_num ]]; then
		spdk=(0)
		return 0
	fi

	if [[ -n $spdk_cpu_list ]]; then
		spdk=($(parse_cpu_list <(echo "$spdk_cpu_list")))
		return 0
	fi

	# Start allocating from the NUMA node with the greater number of pinned VMs.
	node_sort=($(for node in "${!vm_numa_map[@]}"; do
		vms=(${!vm_numa_map[node]})
		echo "${#vms[@]}:$node"
	done | sort -rn))
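	# node_sort now holds "vm_count:node" pairs in descending order, e.g. (illustrative,
	# with 8 VMs on node0 and 4 on node1): node_sort=(8:0 4:1).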

	for _node in "${node_sort[@]}"; do
		node=${_node#*:} vms=${_node%:*}
		local -n node_all_cpus=node_${node}_cpu
		perc=$((vms * 100 / vm_count))
		cpus_per_node=$((spdk_cpu_num * perc / 100))
		cpus_per_node=$((cpus_per_node == 0 ? 1 : cpus_per_node))
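		# Worked example (illustrative): spdk_cpu_num=6, vm_count=12, 8 VMs on node0 and
		# 4 on node1 -> node0: perc=8*100/12=66, cpus_per_node=6*66/100=3; node1: perc=33,
		# cpus_per_node=6*33/100=1. Only 4 of the 6 CPUs get allocated in this loop - the
		# alignment step below grabs the remaining ones.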

		if ((${#node_all_cpus[@]} == 0)); then
			printf 'No CPUs left to allocate for SPDK on node%u. Need %u CPUs\n' \
				"$node" "$cpus_per_node" >&2

			cpus_exhausted[node]=1
			continue
		fi
		if ((${#node_all_cpus[@]} < cpus_per_node)); then
			printf 'Not enough CPUs to allocate for SPDK on node%u. Need %u CPUs, getting %u\n' \
				"$node" "$cpus_per_node" "${#node_all_cpus[@]}" >&2
			cpus_per_node=${#node_all_cpus[@]}
			cpus_exhausted[node]=1
		fi

		spdk+=("${node_all_cpus[@]::cpus_per_node}")
		node_all_cpus=("${node_all_cpus[@]:cpus_per_node}")
		cpus_remained+=("${node_all_cpus[@]}")
	done

	# If we didn't allocate the entire number of requested cpus in the initial run, adjust
	# it by adding the remaining portion from the node with the greater number of pinned VMs.
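	# Continuing the worked example above: 4 < 6, so (with ALIGN_FROM_ALL_NODES unset) the
	# 2 missing CPUs are taken from the dominant node0, bringing the total to the requested 6.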
	if ((${#spdk[@]} < spdk_cpu_num)); then
		if [[ -n $ALIGN_FROM_ALL_NODES ]] && ((${#cpus_remained[@]} > 0)); then
			printf 'Trying to get extra CPUs from all nodes\n'
			local -n node_all_cpus=cpus_remained
		else
			node=${node_sort[0]#*:}
			printf 'Trying to get extra CPUs from the dominant node%u to align: %u < %u\n' \
				"$node" "${#spdk[@]}" "$spdk_cpu_num"
			if ((cpus_exhausted[node])); then
				printf 'No CPUs available on node%u\n' "$node"
			else
				local -n node_all_cpus=node_${node}_cpu
			fi
		fi
		spdk+=("${node_all_cpus[@]::spdk_cpu_num-${#spdk[@]}}")
	fi >&2
	if ((${#spdk[@]} != spdk_cpu_num)); then
		printf 'Different number of SPDK CPUs allocated to meet the requirements: requested %u, got %u\n' \
			"$spdk_cpu_num" "${#spdk[@]}"
	else
		printf 'Requested number of SPDK CPUs allocated: %u\n' "$spdk_cpu_num"
	fi >&2
}

_p_disk_map() {
	((${#auto_disk_map[@]} > 0)) || return 0
	printf '%s\n' "${auto_disk_map[@]}"
}

_p_cpu_map() {
	((${#auto_cpu_map[@]} > 0)) || return 0
	printf '%s\n' "${auto_cpu_map[@]}"
}

p_disk_map() {
	cat <<- DISK_MAP
		# Generated automatically by ${0##*/}
		# NVMe Drives: ${#auto_disk_map[@]} VM count: $vm_count
		$(_p_disk_map)
	DISK_MAP
}

p_vms_in_node() {
	((${#vm_numa_map[@]} > 0)) || return 0

	local node vms
	for node in "${!vm_numa_map[@]}"; do
		vms=(${!vm_numa_map[node]})
		echo "Node$node: ${#vms[@]} VMs"
	done
}

p_cpu_map() {
	local node_stats

	mapfile -t node_stats < <(p_vms_in_node)
	cat <<- CPU_MAP
		# Generated automatically by ${0##*/}
		# VM NUMA Nodes: ${#vm_numa_map[@]} VM count: $vm_count CPU Per VM: $vm_cpu_num SPDK CPU count: ${#spdk[@]}
		$(printf '#  - %s\n' "${node_stats[@]}")
		$(_p_cpu_map)
	CPU_MAP
}

p_all() {
	p_disk_map
	printf '\n'
	p_cpu_map
}

fetch_env() {
	spdk_cpu_num=${spdk_cpu_num:-1}
	vm_count=${vm_count:-1}
	vm_cpu_num=${vm_cpu_num:-1}
	vms_per_nvme=${vms_per_nvme:-1}

	# Normalize
	spdk_cpu_num=$((spdk_cpu_num <= 0 ? 1 : spdk_cpu_num))
	vm_count=$((vm_count <= 0 ? 1 : vm_count))
	vm_cpu_num=$((vm_cpu_num <= 0 ? 1 : vm_cpu_num))
	vms_per_nvme=$((vms_per_nvme <= 0 ? 1 : vms_per_nvme))

	cpu_out=${cpu_out:-"$PWD/auto-cpu.conf"}
	disk_out=${disk_out:-"$PWD/auto-disk.conf"}
}

help() {
	cat <<- HELP
		${0##*/}: [-p all|cpu|disk -s]

		Configuration is generated based on the system's cpu and nvme topology. Parameters
		are taken directly from the environment:

		spdk_cpu_list - list of CPUs to assign to a SPDK app
		spdk_cpu_num  - number of CPUs to use across all NUMA nodes
		                (spdk_cpu_list takes priority, default: 1)
		vm_count      - number of VMs to prepare the configuration for
		                (default: 1)
		vm_cpu_num    - number of CPUs to assign per VM (default: 1)
		vms_per_nvme  - number of VMs to pin to a single nvme (default: 1)

		Override parameters:
		VM[N]_NODE    - overrides the selected NUMA node for VM N - by default,
		                nodes are assigned to VMs up to the number of nvme drives
		cpu_out       - with -s, points at the location where the cpu conf is saved
		disk_out      - with -s, points at the location where the disk conf is saved

		Note: VMs are pinned to nvme drives based on their NUMA location.

		Example:
		# Allocate 6 cpus from node1 for SPDK. Configure 24 VMs, 2 CPUs per VM
		$ export spdk_cpu_num=6 vm_count=24 vm_cpu_num=2
		$ ${0##*/} -p all
	HELP
}

print=""
save=no

fetch_env

while getopts :hsp: arg; do
	case "$arg" in
		h)
			help
			exit 0
			;;
		p) print=$OPTARG ;;
		s) save=yes ;;
		*) ;;
	esac
done

get_auto_cfg || exit 1

case "$print" in
	all) p_all ;;
	cpu) p_cpu_map ;;
	disk) p_disk_map ;;
	*) ;;
esac

if [[ $save == yes ]]; then
	p_cpu_map > "$cpu_out"
	p_disk_map > "$disk_out"
fi