#!/usr/bin/env bash
#  SPDX-License-Identifier: BSD-3-Clause
#  Copyright (C) 2022 Intel Corporation.
#  All rights reserved.

curdir=$(readlink -f "$(dirname "$0")")
rootdir=$(readlink -f "$curdir/../../../")

source "$rootdir/scripts/common.sh"
source "$rootdir/test/scheduler/common.sh"

get_auto_cfg() {
	local vm_cpus vm_node vm vms
	local cpu node nodes_idxs node_idx
	local nvmes nvme nvme_idx nvme_diff nvmes_per_node
	local vm_diff aligned_number_of_vms=0
	local diff iter

	local -g auto_cpu_map=() auto_disk_map=() spdk=()

	map_cpus
	get_nvme_numa_map

	nodes_idxs=("${!nodes[@]}")

	# Construct the initial NUMA-aware setup by pinning each VM to the node of a given nvme ctrl.
	# The first run pins as many VMs (up to vm_count) as there are available nvme ctrls.
	vm=0
	for node in "${nodes_idxs[@]}"; do
		nvmes=(${!nvme_numa_map[node]})
		for ((nvme_idx = 0; nvme_idx < ${#nvmes[@]} && vm < vm_count; vm++, nvme_idx++)); do
			eval "vm${vm}_node=$node"
		done
		nvmes_per_node[node]=${#nvmes[@]}
	done
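	# Illustrative example (hypothetical topology): with 2 nvmes on node0, 1 nvme on node1 and
	# vm_count=5, the first run pins vm0 and vm1 to node0 and vm2 to node1, leaving vm=3.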

	vm_diff=$((vm_count - vm))

	# Align the extra number of VMs in case nvme ctrls are not distributed evenly across the
	# existing NUMA nodes.
	# FIXME: This is targeted at systems with only 2 NUMA nodes. Technically, the kernel supports
	# more than that - it's possible to achieve setups with > 2 NUMA nodes under a virtual env
	# for instance. Should this be of any concern?
	if ((nvmes_per_node[0] < nvmes_per_node[1])); then
		nvme_diff=$((nvmes_per_node[1] - nvmes_per_node[0]))
	elif ((nvmes_per_node[0] > nvmes_per_node[1])); then
		nvme_diff=$((nvmes_per_node[0] - nvmes_per_node[1]))
	else
		nvme_diff=0
	fi

	diff=$((vm_diff + nvme_diff))

	if ((diff % 2 == 0)); then
		aligned_number_of_vms=$((diff / ${#nodes_idxs[@]}))
	fi
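	# Continuing the example above: vm_diff=5-3=2, nvme_diff=2-1=1, so diff=3. diff is odd,
	# hence aligned_number_of_vms stays 0 and the round-robin path below is taken.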

	# The second run distributes the extra VMs across the existing NUMA nodes. If the extra VMs
	# (as per vm_count) can be distributed evenly, simply assign them in bulk. Otherwise, do some
	# simple rr balancing where they are assigned one by one - first to node0, second to node1,
	# third to node0, etc.
	if ((aligned_number_of_vms)); then
		for node in "${nodes_idxs[@]}"; do
			for ((iter = 0; iter < aligned_number_of_vms && vm < vm_count; iter++, vm++)); do
				eval "vm${vm}_node=$node"
			done
		done
	else
		while ((vm < vm_count)); do
			for node in "${nodes_idxs[@]}"; do
				eval "vm${vm}_node=$node"
				((++vm))
			done
		done
	fi
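	# Continuing the example: round-robin assigns vm3 to node0 and vm4 to node1, so node0 ends
	# up with vm0, vm1, vm3 and node1 with vm2, vm4.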

	local -g vm_numa_map=()
	for ((vm = 0; vm < vm_count; vm++)); do
		# Load balance the cpus across the available numa nodes based on the pinning done
		# prior. If there are no cpus left under the selected node, iterate over all
		# available nodes. If no cpus are left at all, fail. Mixing cpus from different
		# nodes is not allowed for the sake of performance.
		node_idx=0 node_idx_perc=0
		eval "vm_node=\$vm${vm}_node"

		local -n node_cpus=node_${vm_node}_cpu
		local -n vm_nodes=node_${vm_node}_vm

		vm_numa_map[vm_node]="node_${vm_node}_vm[@]"

		while ((${#node_cpus[@]} < vm_cpu_num && node_idx < ${#nodes_idxs[@]})); do
			vm_node=${nodes_idxs[node_idx]}
			local -n node_cpus=node_${nodes_idxs[node_idx++]}_cpu
		done

		if ((${#node_cpus[@]} < vm_cpu_num)); then
			printf 'Not enough CPUs available for VM %u (CPUs: %u, Nodes: %u, CPUs per VM: %u)\n' \
				"$vm" "${#cpus[@]}" "${#nodes_idxs[@]}" "$vm_cpu_num" >&2
			return 1
		fi

		# Normalize indexes
		node_cpus=("${node_cpus[@]}")

		vm_cpus=("${node_cpus[@]::vm_cpu_num}")
		node_cpus=("${node_cpus[@]:vm_cpu_num}")

		auto_cpu_map+=("$(
			cat <<- CPU_VM
				VM_${vm}_qemu_mask=$(
				IFS=","
				echo "${vm_cpus[*]}"
				)
				VM_${vm}_qemu_numa_node=$vm_node
			CPU_VM
		)")

		# Save a map of each VM->NUMA node so the disk map can be constructed in later steps.
		vm_nodes+=("$vm")
	done
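	# Continuing the example with vm_cpu_num=2, a generated auto_cpu_map entry looks like
	# (illustrative CPU ids):
	#   VM_0_qemu_mask=0,1
	#   VM_0_qemu_numa_node=0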

	# auto_cpu_map is ready and all requested VMs should be balanced across all NUMA nodes,
	# making sure each nvme drive will be bound to at least 1 VM placed on the corresponding
	# NUMA node. Now, construct the disk_cfg and assign VMs, with a proper split value, to
	# each nvme - extra VMs will be added to the nvme drives in their bus order.
	local -A nvme_vm_map=()
	local iter nvmes_no=0 vms_no=0 _vms_per_nvme
	for node in "${nodes_idxs[@]}"; do
		if [[ ! -v nvme_numa_map[node] ]]; then
			# There are no drives available on that node, skip it
			continue
		fi
		nvmes=(${!nvme_numa_map[node]}) nvmes_no=${#nvmes[@]}
		vms=(${!vm_numa_map[node]}) vms_no=${#vms[@]}
		for ((iter = 0; iter <= (vms_no - nvmes_no <= 0 ? 1 : vms_no - nvmes_no); iter++)); do
			for nvme in "${nvmes[@]}"; do
				_vms_per_nvme=0
				if ((${#vms[@]} == 0)); then
					# No VMs left on the given node or they have been exhausted - skip all remaining drives.
					continue 3
				fi
				nvme_vm_map["$nvme"]="_${nvme//[:.]/_}_[@]"
				local -n nvme_vms=_${nvme//[:.]/_}_
				while ((++_vms_per_nvme <= vms_per_nvme)); do
					nvme_vms+=("${vms[0]}") vms=("${vms[@]:1}")
				done
			done
		done
	done
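	# Continuing the example with vms_per_nvme=1: node0's first nvme gets vm0 and (in the second
	# pass) vm3, its second nvme gets vm1; node1's single nvme gets vm2 and vm4.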

	local sorted_nvmes=()
	sorted_nvmes=($(printf '%s\n' "${!nvme_vm_map[@]}" | sort))
	for nvme in "${!sorted_nvmes[@]}"; do
		vms=(${!nvme_vm_map["${sorted_nvmes[nvme]}"]})
		auto_disk_map+=("${sorted_nvmes[nvme]},Nvme$((nvme++)),${#vms[*]},${vms[*]}")
	done
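	# Each auto_disk_map entry has the form <pci address>,NvmeN,<number of VMs>,<VM list>,
	# e.g. (illustrative PCI address): 0000:1a:00.0,Nvme0,2,0 3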

	get_spdk_cpus || return 1

	auto_cpu_map+=("vhost_0_reactor_mask=[$(
		IFS=","
		echo "${spdk[*]}"
	)]")
	auto_cpu_map+=("vhost_0_master_core=${spdk[0]}")
}

get_nvme_numa_map() {
	local nvmes nvme node
	local -g nvme_numa_map=()

	cache_pci_bus

	for nvme in ${pci_bus_cache[0x010802]}; do
		node=$(< "/sys/bus/pci/devices/$nvme/numa_node")
		nvme_numa_map[node]="node_${node}_nvme[@]"
		local -n node_nvmes=node_${node}_nvme
		node_nvmes+=("$nvme")
	done
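	# Example result (illustrative PCI addresses): nvme_numa_map[0]="node_0_nvme[@]" with
	# node_0_nvme=(0000:1a:00.0 0000:1b:00.0), nvme_numa_map[1]="node_1_nvme[@]" with
	# node_1_nvme=(0000:85:00.0).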
}

get_spdk_cpus() {
	local -g spdk=()
	local node vms perc
	local cpus_per_node cpus_exhausted=() cpus_remained=()

	if [[ -z $spdk_cpu_num ]]; then
		spdk=(0)
		return 0
	fi

	if [[ -n $spdk_cpu_list ]]; then
		spdk=($(parse_cpu_list <(echo "$spdk_cpu_list")))
		return 0
	fi

	# Start allocating from the NUMA node with the highest number of pinned VMs.
	node_sort=($(for node in "${!vm_numa_map[@]}"; do
		vms=(${!vm_numa_map[node]})
		echo "${#vms[@]}:$node"
	done | sort -rn))

	for _node in "${node_sort[@]}"; do
		node=${_node#*:} vms=${_node%:*}
		local -n node_all_cpus=node_${node}_cpu
		perc=$((vms * 100 / vm_count))
		cpus_per_node=$((spdk_cpu_num * perc / 100))
		cpus_per_node=$((cpus_per_node == 0 ? 1 : cpus_per_node))
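		# Continuing the example with spdk_cpu_num=4: node0 holds 3 of the 5 VMs, so perc=60
		# and cpus_per_node=4*60/100=2; node1 holds 2 VMs, so perc=40 and cpus_per_node=1.
		# This first pass therefore allocates 3 of the 4 requested CPUs.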

		if ((${#node_all_cpus[@]} == 0)); then
			printf 'No CPUs left to allocate for SPDK on node%u. Need %u CPUs\n' \
				"$node" "$cpus_per_node" >&2

			cpus_exhausted[node]=1
			continue
		fi
		if ((${#node_all_cpus[@]} < cpus_per_node)); then
			printf 'Not enough CPUs to allocate for SPDK on node%u. Need %u CPUs, getting %u\n' \
				"$node" "$cpus_per_node" "${#node_all_cpus[@]}" >&2
			cpus_per_node=${#node_all_cpus[@]}
			cpus_exhausted[node]=1
		fi

		spdk+=("${node_all_cpus[@]::cpus_per_node}")
		node_all_cpus=("${node_all_cpus[@]:cpus_per_node}")
		cpus_remained+=("${node_all_cpus[@]}")
	done

	# If we didn't allocate the entire number of requested cpus in the initial run, adjust it
	# by adding the remaining portion from the node with the highest number of pinned VMs.
	if ((${#spdk[@]} < spdk_cpu_num)); then
		if [[ -n $ALIGN_FROM_ALL_NODES ]] && ((${#cpus_remained[@]} > 0)); then
			printf 'Trying to get extra CPUs from all nodes\n'
			local -n node_all_cpus=cpus_remained
		else
			node=${node_sort[0]#*:}
			printf 'Trying to get extra CPUs from the dominant node%u to align: %u < %u\n' \
				"$node" "${#spdk[@]}" "$spdk_cpu_num"
			if ((cpus_exhausted[node])); then
				printf 'No CPUs available on node%u\n' "$node"
			else
				local -n node_all_cpus=node_${node}_cpu
			fi
		fi
		spdk+=("${node_all_cpus[@]::spdk_cpu_num-${#spdk[@]}}")
	fi >&2
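	# Continuing the example: with only 3 of the 4 requested CPUs allocated, the remaining CPU
	# is taken from node0, the dominant node.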
	if ((${#spdk[@]} != spdk_cpu_num)); then
		printf 'Allocated a different number of SPDK CPUs than requested: requested %u, got %u\n' \
			"$spdk_cpu_num" "${#spdk[@]}"
	else
		printf 'Requested number of SPDK CPUs allocated: %u\n' "$spdk_cpu_num"
	fi >&2
}

_p_disk_map() {
	((${#auto_disk_map[@]} > 0)) || return 0
	printf '%s\n' "${auto_disk_map[@]}"
}

_p_cpu_map() {
	((${#auto_cpu_map[@]} > 0)) || return 0
	printf '%s\n' "${auto_cpu_map[@]}"
}

p_disk_map() {
	cat <<- DISK_MAP
		# Generated automatically by ${0##*/}
		# NVMe Drives: ${#auto_disk_map[@]} VM count: $vm_count
		$(_p_disk_map)
	DISK_MAP
}
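# Example p_disk_map output (illustrative, continuing the example above):
#   # Generated automatically by conf-generator
#   # NVMe Drives: 3 VM count: 5
#   0000:1a:00.0,Nvme0,2,0 3
#   0000:1b:00.0,Nvme1,1,1
#   0000:85:00.0,Nvme2,2,2 4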

p_vms_in_node() {
	((${#vm_numa_map[@]} > 0)) || return 0

	local node vms
	for node in "${!vm_numa_map[@]}"; do
		vms=(${!vm_numa_map[node]})
		echo "Node$node: ${#vms[@]} VMs"
	done
}

p_cpu_map() {
	local node_stats

	mapfile -t node_stats < <(p_vms_in_node)
	cat <<- CPU_MAP
		# Generated automatically by ${0##*/}
		# VM NUMA Nodes: ${#vm_numa_map[@]} VM count: $vm_count CPU Per VM: $vm_cpu_num SPDK CPU count: ${#spdk[@]}
		$(printf '#  - %s\n' "${node_stats[@]}")
		$(_p_cpu_map)
	CPU_MAP
}
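# Example p_cpu_map output (illustrative CPU ids, continuing the example above):
#   # Generated automatically by conf-generator
#   # VM NUMA Nodes: 2 VM count: 5 CPU Per VM: 2 SPDK CPU count: 4
#   #  - Node0: 3 VMs
#   #  - Node1: 2 VMs
#   VM_0_qemu_mask=0,1
#   VM_0_qemu_numa_node=0
#   ...
#   vhost_0_reactor_mask=[6,7,8,22]
#   vhost_0_master_core=6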

p_all() {
	p_disk_map
	printf '\n'
	p_cpu_map
}

fetch_env() {
	spdk_cpu_num=${spdk_cpu_num:-1}
	vm_count=${vm_count:-1}
	vm_cpu_num=${vm_cpu_num:-1}
	vms_per_nvme=${vms_per_nvme:-1}

	# Normalize
	spdk_cpu_num=$((spdk_cpu_num <= 0 ? 1 : spdk_cpu_num))
	vm_count=$((vm_count <= 0 ? 1 : vm_count))
	vm_cpu_num=$((vm_cpu_num <= 0 ? 1 : vm_cpu_num))
	vms_per_nvme=$((vms_per_nvme <= 0 ? 1 : vms_per_nvme))

	cpu_out=${cpu_out:-"$PWD/auto-cpu.conf"}
	disk_out=${disk_out:-"$PWD/auto-disk.conf"}
}

help() {
	cat <<- HELP
		${0##*/}: [-p all|cpu|disk -s]

		The configuration is generated based on the system's cpu and nvme topology. Parameters
		are taken directly from the environment:

		spdk_cpu_list - list of CPUs to assign to the SPDK app
		spdk_cpu_num  - number of CPUs to use across all NUMA nodes
		                (spdk_cpu_list takes priority, default: 1)
		vm_count      - number of VMs to prepare the configuration for
		                (default: 1)
		vm_cpu_num    - number of CPUs to assign per VM (default: 1)
		vms_per_nvme  - number of VMs to pin to a single nvme (default: 1)

		Override parameters:
		vmN_node      - overrides the selected NUMA node for VM N - by default,
		                VMs are assigned to nodes based on the NUMA locality of the nvme drives
		cpu_out       - with -s, points at the location where the cpu conf is saved
		disk_out      - with -s, points at the location where the disk conf is saved

		Note: VMs are pinned to nvme drives based on their NUMA location.

		Example:
		# Allocate 6 CPUs for SPDK. Configure 24 VMs, 2 CPUs per VM
		$ export spdk_cpu_num=6 vm_count=24 vm_cpu_num=2
		$ ${0##*/} -p all
	HELP
}

print=""
save=no

fetch_env

while getopts :hsp: arg; do
	case "$arg" in
		h)
			help
			exit 0
			;;
		p) print=$OPTARG ;;
		s) save=yes ;;
		*) ;;
	esac
done

get_auto_cfg || exit 1

case "$print" in
	all) p_all ;;
	cpu) p_cpu_map ;;
	disk) p_disk_map ;;
	*) ;;
esac

if [[ $save == yes ]]; then
	p_cpu_map > "$cpu_out"
	p_disk_map > "$disk_out"
fi
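# Example invocation (illustrative): print both maps and save them to the current directory:
#   spdk_cpu_num=4 vm_count=5 vm_cpu_num=2 ./conf-generator -p all -s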