/* SPDX-License-Identifier: BSD-3-Clause
 * Copyright 2019 Mellanox Technologies, Ltd
 */
#include <stdlib.h>

#include <rte_malloc.h>
#include <rte_errno.h>
#include <rte_common.h>
#include <rte_sched_common.h>

#include <mlx5_prm.h>
#include <mlx5_common.h>

#include "mlx5_vdpa_utils.h"
#include "mlx5_vdpa.h"

void
mlx5_vdpa_mem_dereg(struct mlx5_vdpa_priv *priv)
{
	struct mlx5_vdpa_query_mr *mrs =
		(struct mlx5_vdpa_query_mr *)priv->mrs;
	struct mlx5_vdpa_query_mr *entry;
	int i;

	if (priv->mrs) {
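		/*
		 * Release entries in reverse creation order: the indirect
		 * mkey (the last entry) is destroyed before the direct MRs
		 * it references.
		 */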
		for (i = priv->num_mrs - 1; i >= 0; i--) {
			entry = &mrs[i];
			if (entry->is_indirect) {
				if (entry->mkey)
					claim_zero(
					mlx5_devx_cmd_destroy(entry->mkey));
			} else {
				if (entry->mr)
					claim_zero(
					mlx5_glue->dereg_mr(entry->mr));
			}
		}
		rte_free(priv->mrs);
		priv->mrs = NULL;
		priv->num_mrs = 0;
	}
	if (priv->vmem_info.vmem) {
		free(priv->vmem_info.vmem);
		priv->vmem_info.vmem = NULL;
	}
	priv->gpa_mkey_index = 0;
}

static int
mlx5_vdpa_regions_addr_cmp(const void *a, const void *b)
{
	const struct rte_vhost_mem_region *region_a = a;
	const struct rte_vhost_mem_region *region_b = b;

	if (region_a->guest_phys_addr < region_b->guest_phys_addr)
		return -1;
	if (region_a->guest_phys_addr > region_b->guest_phys_addr)
		return 1;
	return 0;
}

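/*
 * Number of KLM entries needed to cover sz bytes when a single entry maps
 * at most MLX5_MAX_KLM_BYTE_COUNT (2G) bytes, e.g. a 5G range needs three
 * entries.
 */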
#define KLM_NUM_MAX_ALIGN(sz) (RTE_ALIGN_CEIL(sz, MLX5_MAX_KLM_BYTE_COUNT) / \
			       MLX5_MAX_KLM_BYTE_COUNT)

/*
 * Allocate and sort the region list and choose the indirect mkey mode:
 *   1. Calculate the GCD, the guest memory size and the number of indirect
 *      mkey entries per mode.
 *   2. Limit the GCD to the maximum allowed size (2G) and reduce it to a
 *      power of 2.
 *   3. Decide the indirect mkey mode according to the following rules:
 *         a. If both the KLM_FBS entries number and the KLM entries number
 *            are bigger than the maximum allowed (MLX5_DEVX_MAX_KLM_ENTRIES)
 *            - error.
 *         b. KLM mode if the KLM_FBS entries number is bigger than the
 *            maximum allowed (MLX5_DEVX_MAX_KLM_ENTRIES).
 *         c. KLM mode if the GCD is smaller than the minimum allowed (4K).
 *         d. KLM mode if the total size of the KLM entries fits in one cache
 *            line and the total size of the KLM_FBS entries does not.
 *         e. Otherwise, KLM_FBS mode.
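 *
 * Illustration (hypothetical layout): two 1G regions separated by a 2G hole
 * give GCD = 1G, 3 KLM entries (region, hole, region) and
 * mem_size / GCD = 4 KLM_FBS entries; assuming 16B KLM entries and a 64B
 * cache line, both totals fit in one cache line, so rule (e) selects
 * KLM_FBS mode.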
 */
static struct rte_vhost_memory *
mlx5_vdpa_vhost_mem_regions_prepare(int vid, uint8_t *mode, uint64_t *mem_size,
				    uint64_t *gcd, uint32_t *entries_num)
{
	struct rte_vhost_memory *mem;
	uint64_t size;
	uint64_t klm_entries_num = 0;
	uint64_t klm_fbs_entries_num;
	uint32_t i;
	int ret = rte_vhost_get_mem_table(vid, &mem);

	if (ret < 0) {
		DRV_LOG(ERR, "Failed to get VM memory layout for vid %d.", vid);
		rte_errno = EINVAL;
		return NULL;
	}
	qsort(mem->regions, mem->nregions, sizeof(mem->regions[0]),
	      mlx5_vdpa_regions_addr_cmp);
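	/* Span from the first region's GPA to the end of the last region. */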
	*mem_size = (mem->regions[(mem->nregions - 1)].guest_phys_addr) +
				      (mem->regions[(mem->nregions - 1)].size) -
					      (mem->regions[0].guest_phys_addr);
	*gcd = 0;
	for (i = 0; i < mem->nregions; ++i) {
		DRV_LOG(INFO, "Region %u: HVA 0x%" PRIx64 ", GPA 0x%" PRIx64
			", size 0x%" PRIx64 ".", i,
			mem->regions[i].host_user_addr,
			mem->regions[i].guest_phys_addr, mem->regions[i].size);
		if (i > 0) {
			/* Handle the hole before this region. */
			size = mem->regions[i].guest_phys_addr -
				(mem->regions[i - 1].guest_phys_addr +
				 mem->regions[i - 1].size);
			*gcd = rte_get_gcd64(*gcd, size);
			klm_entries_num += KLM_NUM_MAX_ALIGN(size);
		}
		size = mem->regions[i].size;
		*gcd = rte_get_gcd64(*gcd, size);
		klm_entries_num += KLM_NUM_MAX_ALIGN(size);
	}
	if (*gcd > MLX5_MAX_KLM_BYTE_COUNT)
		*gcd = rte_get_gcd64(*gcd, MLX5_MAX_KLM_BYTE_COUNT);
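	/* Reduce a non-power-of-2 GCD to its largest power-of-2 divisor. */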
	if (!RTE_IS_POWER_OF_2(*gcd)) {
		uint64_t candidate_gcd = rte_align64prevpow2(*gcd);

		while (candidate_gcd > 1 && (*gcd % candidate_gcd))
			candidate_gcd /= 2;
		DRV_LOG(DEBUG, "GCD 0x%" PRIx64 " is not power of 2. Adjusted "
			"GCD is 0x%" PRIx64 ".", *gcd, candidate_gcd);
		*gcd = candidate_gcd;
	}
	klm_fbs_entries_num = *mem_size / *gcd;
	if (*gcd < MLX5_MIN_KLM_FIXED_BUFFER_SIZE || klm_fbs_entries_num >
	    MLX5_DEVX_MAX_KLM_ENTRIES ||
	    ((klm_entries_num * sizeof(struct mlx5_klm)) <=
	    RTE_CACHE_LINE_SIZE && (klm_fbs_entries_num *
				    sizeof(struct mlx5_klm)) >
							RTE_CACHE_LINE_SIZE)) {
		*mode = MLX5_MKC_ACCESS_MODE_KLM;
		*entries_num = klm_entries_num;
		DRV_LOG(INFO, "Indirect mkey mode is KLM.");
	} else {
		*mode = MLX5_MKC_ACCESS_MODE_KLM_FBS;
		*entries_num = klm_fbs_entries_num;
		DRV_LOG(INFO, "Indirect mkey mode is KLM Fixed Buffer Size.");
	}
	DRV_LOG(DEBUG, "Memory registration information: nregions = %u, "
		"mem_size = 0x%" PRIx64 ", GCD = 0x%" PRIx64
		", klm_fbs_entries_num = 0x%" PRIx64 ", klm_entries_num = 0x%"
		PRIx64 ".", mem->nregions, *mem_size, *gcd, klm_fbs_entries_num,
		klm_entries_num);
	if (*entries_num > MLX5_DEVX_MAX_KLM_ENTRIES) {
		DRV_LOG(ERR, "Failed to prepare memory of vid %d - memory is "
			"too fragmented.", vid);
		free(mem);
		return NULL;
	}
	return mem;
}

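/* Return 0 when both layouts contain identical regions (same GPA and size). */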
static int
mlx5_vdpa_mem_cmp(struct rte_vhost_memory *mem1, struct rte_vhost_memory *mem2)
{
	uint32_t i;

	if (mem1->nregions != mem2->nregions)
		return -1;
	for (i = 0; i < mem1->nregions; i++) {
		if (mem1->regions[i].guest_phys_addr !=
		    mem2->regions[i].guest_phys_addr)
			return -1;
		if (mem1->regions[i].size != mem2->regions[i].size)
			return -1;
	}
	return 0;
}

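/* Clamp a single KLM entry size to the per-entry maximum (2G). */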
#define KLM_SIZE_MAX_ALIGN(sz) ((sz) > MLX5_MAX_KLM_BYTE_COUNT ? \
				MLX5_MAX_KLM_BYTE_COUNT : (sz))

static int
mlx5_vdpa_create_indirect_mkey(struct mlx5_vdpa_priv *priv)
{
	struct mlx5_devx_mkey_attr mkey_attr;
	struct mlx5_vdpa_query_mr *mrs =
		(struct mlx5_vdpa_query_mr *)priv->mrs;
	struct mlx5_vdpa_query_mr *entry;
	struct rte_vhost_mem_region *reg;
	uint8_t mode = priv->vmem_info.mode;
	uint32_t entries_num = priv->vmem_info.entries_num;
	struct rte_vhost_memory *mem = priv->vmem_info.vmem;
	struct mlx5_klm klm_array[entries_num];
	uint64_t gcd = priv->vmem_info.gcd;
	int ret = -rte_errno;
	uint64_t klm_size;
	int klm_index = 0;
	uint64_t k;
	uint32_t i;

	/* Build the KLM array for all regions and the holes between them. */
	for (i = 0; i < mem->nregions; i++) {
		entry = &mrs[i];
		reg = &mem->regions[i];
		if (i > 0) {
			uint64_t sadd;
			uint64_t empty_region_sz = reg->guest_phys_addr -
					  (mem->regions[i - 1].guest_phys_addr +
					   mem->regions[i - 1].size);

			if (empty_region_sz > 0) {
				sadd = mem->regions[i - 1].guest_phys_addr +
				       mem->regions[i - 1].size;
				klm_size = mode == MLX5_MKC_ACCESS_MODE_KLM ?
				      KLM_SIZE_MAX_ALIGN(empty_region_sz) : gcd;
				for (k = 0; k < empty_region_sz;
				     k += klm_size) {
					klm_array[klm_index].byte_count =
						k + klm_size > empty_region_sz ?
						 empty_region_sz - k : klm_size;
					klm_array[klm_index].mkey =
							    priv->null_mr->lkey;
					klm_array[klm_index].address = sadd + k;
					klm_index++;
				}
			}
		}
		klm_size = mode == MLX5_MKC_ACCESS_MODE_KLM ?
					    KLM_SIZE_MAX_ALIGN(reg->size) : gcd;
		for (k = 0; k < reg->size; k += klm_size) {
			klm_array[klm_index].byte_count = k + klm_size >
					   reg->size ? reg->size - k : klm_size;
			klm_array[klm_index].mkey = entry->mr->lkey;
			klm_array[klm_index].address = reg->guest_phys_addr + k;
			klm_index++;
		}
	}
	memset(&mkey_attr, 0, sizeof(mkey_attr));
	mkey_attr.addr = (uintptr_t)(mem->regions[0].guest_phys_addr);
	mkey_attr.size = priv->vmem_info.size;
	mkey_attr.pd = priv->cdev->pdn;
	mkey_attr.umem_id = 0;
	/* Must be zero for KLM mode. */
	mkey_attr.log_entity_size = mode == MLX5_MKC_ACCESS_MODE_KLM_FBS ?
							  rte_log2_u64(gcd) : 0;
	mkey_attr.pg_access = 0;
	mkey_attr.klm_array = klm_array;
	mkey_attr.klm_num = klm_index;
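	/* The indirect mkey occupies the reserved last entry. */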
	entry = &mrs[mem->nregions];
	entry->mkey = mlx5_devx_cmd_mkey_create(priv->cdev->ctx, &mkey_attr);
	if (!entry->mkey) {
		DRV_LOG(ERR, "Failed to create indirect Mkey.");
		rte_errno = -ret;
		return ret;
	}
	entry->is_indirect = 1;
	priv->gpa_mkey_index = entry->mkey->id;
	return 0;
}

/*
 * The target here is to group all the physical memory regions of the
 * virtio device in one indirect mkey.
 * For KLM Fixed Buffer Size mode (HW finds the translation entry in one
 * read according to the guest physical address):
 * all the sub-direct mkeys must be of the same size, hence each of them
 * should be of the GCD size of all the virtio memory regions and the
 * holes between them.
 * For KLM mode (each entry may have a different size so HW must iterate
 * the entries):
 * each virtio memory region and each hole between them has one entry;
 * only the maximum allowed entry size (2G) must be honored, by splitting
 * entries whose associated memory regions are bigger than 2G.
 * This means that in both modes a virtio memory region may be mapped to
 * more than one direct mkey.
 * All the holes of invalid memory between the virtio memory regions
 * will be mapped to the null memory region for security.
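 *
 * Illustration (hypothetical layout, 2G per-entry maximum): a 3G region,
 * a 1G hole and a 1G region map in KLM mode to 2 + 1 + 1 direct entries
 * (the 3G region is split into 2G + 1G), while in KLM FBS mode every entry
 * is GCD-sized (here 1G), giving 5 entries, with the hole entry pointing
 * to the null memory region.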
 */
int
mlx5_vdpa_mem_register(struct mlx5_vdpa_priv *priv)
{
	void *mrs;
	uint8_t mode = 0;
	int ret = -rte_errno;
	uint32_t i, thrd_idx, data[1];
	RTE_ATOMIC(uint32_t) remaining_cnt = 0;
	RTE_ATOMIC(uint32_t) err_cnt = 0;
	uint32_t task_num = 0;
	struct rte_vhost_memory *mem = mlx5_vdpa_vhost_mem_regions_prepare
			(priv->vid, &mode, &priv->vmem_info.size,
			&priv->vmem_info.gcd, &priv->vmem_info.entries_num);

	if (!mem)
		return -rte_errno;
	if (priv->vmem_info.vmem != NULL) {
		if (mlx5_vdpa_mem_cmp(mem, priv->vmem_info.vmem) == 0) {
			/* VM memory not changed, reuse resources. */
			free(mem);
			return 0;
		}
		mlx5_vdpa_mem_dereg(priv);
	}
	priv->vmem_info.vmem = mem;
	priv->vmem_info.mode = mode;
	priv->num_mrs = mem->nregions;
	if (!priv->num_mrs || priv->num_mrs >= MLX5_VDPA_MAX_MRS) {
		DRV_LOG(ERR,
		"Invalid number of memory regions.");
		goto error;
	}
	/* The last one is the indirect mkey entry. */
	priv->num_mrs++;
	mrs = rte_zmalloc("mlx5 vDPA memory regions",
		sizeof(struct mlx5_vdpa_query_mr) * priv->num_mrs, 0);
	priv->mrs = mrs;
	if (!priv->mrs) {
		DRV_LOG(ERR, "Failed to allocate private memory regions.");
		goto error;
	}
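	/*
	 * Spread MR registration across the configuration threads; every
	 * (max_thrds + 1)-th region, and any region whose task cannot be
	 * queued, is registered synchronously by the main thread below.
	 */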
	if (priv->use_c_thread) {
		uint32_t main_task_idx[mem->nregions];

		for (i = 0; i < mem->nregions; i++) {
			thrd_idx = i % (conf_thread_mng.max_thrds + 1);
			if (!thrd_idx) {
				main_task_idx[task_num] = i;
				task_num++;
				continue;
			}
			thrd_idx = priv->last_c_thrd_idx + 1;
			if (thrd_idx >= conf_thread_mng.max_thrds)
				thrd_idx = 0;
			priv->last_c_thrd_idx = thrd_idx;
			data[0] = i;
			if (mlx5_vdpa_task_add(priv, thrd_idx,
				MLX5_VDPA_TASK_REG_MR,
				&remaining_cnt, &err_cnt,
				(void **)&data, 1)) {
				DRV_LOG(ERR,
				"Failed to add task for mem region %d.", i);
				main_task_idx[task_num] = i;
				task_num++;
			}
		}
		for (i = 0; i < task_num; i++) {
			ret = mlx5_vdpa_register_mr(priv,
					main_task_idx[i]);
			if (ret) {
				DRV_LOG(ERR,
				"Failed to register mem region %d.",
				main_task_idx[i]);
				goto error;
			}
		}
		if (mlx5_vdpa_c_thread_wait_bulk_tasks_done(&remaining_cnt,
			&err_cnt, 100)) {
			DRV_LOG(ERR,
			"Failed to wait for mem region registration tasks.");
			goto error;
		}
	} else {
		for (i = 0; i < mem->nregions; i++) {
			ret = mlx5_vdpa_register_mr(priv, i);
			if (ret) {
				DRV_LOG(ERR,
				"Failed to register mem region %d.", i);
				goto error;
			}
		}
	}
	ret = mlx5_vdpa_create_indirect_mkey(priv);
	if (ret) {
		DRV_LOG(ERR, "Failed to create indirect mkey.");
		goto error;
	}
	return 0;
error:
	mlx5_vdpa_mem_dereg(priv);
	rte_errno = -ret;
	return ret;
}

int
mlx5_vdpa_register_mr(struct mlx5_vdpa_priv *priv, uint32_t idx)
{
	struct rte_vhost_memory *mem = priv->vmem_info.vmem;
	struct mlx5_vdpa_query_mr *mrs =
		(struct mlx5_vdpa_query_mr *)priv->mrs;
	struct mlx5_vdpa_query_mr *entry;
	struct rte_vhost_mem_region *reg;
	int ret;

	reg = &mem->regions[idx];
	entry = &mrs[idx];
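	/* Register the region's HVA range with its GPA as the mkey IOVA. */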
	entry->mr = mlx5_glue->reg_mr_iova
				      (priv->cdev->pd,
				       (void *)(uintptr_t)(reg->host_user_addr),
				       reg->size, reg->guest_phys_addr,
				       IBV_ACCESS_LOCAL_WRITE);
	if (!entry->mr) {
		DRV_LOG(ERR, "Failed to create direct Mkey.");
		ret = -rte_errno;
		return ret;
	}
	entry->is_indirect = 0;
	return 0;
}
403