1 /* SPDX-License-Identifier: BSD-3-Clause
2 * Copyright 2022 Microsoft Corporation
3 */
4
5 #include <rte_malloc.h>
6 #include <ethdev_driver.h>
7 #include <rte_eal_paging.h>
8
9 #include <infiniband/verbs.h>
10
11 #include "mana.h"
12
/* One page-aligned address range backing a mempool memory chunk;
 * filled in by mana_mempool_chunk_cb() for later MR registration.
 */
struct mana_range {
	uintptr_t start;	/* chunk start, rounded down to a page boundary */
	uintptr_t end;		/* chunk end, rounded up to a page boundary */
	uint32_t len;		/* end - start; NOTE(review): truncates if a
				 * single chunk spans >= 4 GB — confirm chunk
				 * sizes stay below UINT32_MAX
				 */
};
18
19 void
mana_mempool_chunk_cb(struct rte_mempool * mp __rte_unused,void * opaque,struct rte_mempool_memhdr * memhdr,unsigned int idx)20 mana_mempool_chunk_cb(struct rte_mempool *mp __rte_unused, void *opaque,
21 struct rte_mempool_memhdr *memhdr, unsigned int idx)
22 {
23 struct mana_range *ranges = opaque;
24 struct mana_range *range = &ranges[idx];
25 uint64_t page_size = rte_mem_page_size();
26
27 range->start = RTE_ALIGN_FLOOR((uintptr_t)memhdr->addr, page_size);
28 range->end = RTE_ALIGN_CEIL((uintptr_t)memhdr->addr + memhdr->len,
29 page_size);
30 range->len = range->end - range->start;
31 }
32
33 /*
34 * Register all memory regions from pool.
35 */
36 int
mana_new_pmd_mr(struct mana_mr_btree * local_tree,struct mana_priv * priv,struct rte_mempool * pool)37 mana_new_pmd_mr(struct mana_mr_btree *local_tree, struct mana_priv *priv,
38 struct rte_mempool *pool)
39 {
40 struct ibv_mr *ibv_mr;
41 struct mana_range ranges[pool->nb_mem_chunks];
42 uint32_t i;
43 struct mana_mr_cache mr;
44 int ret;
45
46 rte_mempool_mem_iter(pool, mana_mempool_chunk_cb, ranges);
47
48 for (i = 0; i < pool->nb_mem_chunks; i++) {
49 if (ranges[i].len > priv->max_mr_size) {
50 DP_LOG(ERR, "memory chunk size %u exceeding max MR",
51 ranges[i].len);
52 return -ENOMEM;
53 }
54
55 DP_LOG(DEBUG,
56 "registering memory chunk start 0x%" PRIxPTR " len %u",
57 ranges[i].start, ranges[i].len);
58
59 if (rte_eal_process_type() == RTE_PROC_SECONDARY) {
60 /* Send a message to the primary to do MR */
61 ret = mana_mp_req_mr_create(priv, ranges[i].start,
62 ranges[i].len);
63 if (ret) {
64 DP_LOG(ERR,
65 "MR failed start 0x%" PRIxPTR " len %u",
66 ranges[i].start, ranges[i].len);
67 return ret;
68 }
69 continue;
70 }
71
72 ibv_mr = ibv_reg_mr(priv->ib_pd, (void *)ranges[i].start,
73 ranges[i].len, IBV_ACCESS_LOCAL_WRITE);
74 if (ibv_mr) {
75 DP_LOG(DEBUG, "MR lkey %u addr %p len %zu",
76 ibv_mr->lkey, ibv_mr->addr, ibv_mr->length);
77
78 mr.lkey = ibv_mr->lkey;
79 mr.addr = (uintptr_t)ibv_mr->addr;
80 mr.len = ibv_mr->length;
81 mr.verb_obj = ibv_mr;
82
83 rte_spinlock_lock(&priv->mr_btree_lock);
84 ret = mana_mr_btree_insert(&priv->mr_btree, &mr);
85 rte_spinlock_unlock(&priv->mr_btree_lock);
86 if (ret) {
87 ibv_dereg_mr(ibv_mr);
88 DP_LOG(ERR, "Failed to add to global MR btree");
89 return ret;
90 }
91
92 ret = mana_mr_btree_insert(local_tree, &mr);
93 if (ret) {
94 /* Don't need to clean up MR as it's already
95 * in the global tree
96 */
97 DP_LOG(ERR, "Failed to add to local MR btree");
98 return ret;
99 }
100 } else {
101 DP_LOG(ERR, "MR failed at 0x%" PRIxPTR " len %u",
102 ranges[i].start, ranges[i].len);
103 return -errno;
104 }
105 }
106 return 0;
107 }
108
109 /*
110 * Deregister a MR.
111 */
112 void
mana_del_pmd_mr(struct mana_mr_cache * mr)113 mana_del_pmd_mr(struct mana_mr_cache *mr)
114 {
115 int ret;
116 struct ibv_mr *ibv_mr = (struct ibv_mr *)mr->verb_obj;
117
118 ret = ibv_dereg_mr(ibv_mr);
119 if (ret)
120 DP_LOG(ERR, "dereg MR failed ret %d", ret);
121 }
122
123 /*
124 * Alloc a MR.
125 * Try to find a MR in the cache. If not found, register a new MR.
126 */
127 struct mana_mr_cache *
mana_alloc_pmd_mr(struct mana_mr_btree * local_mr_btree,struct mana_priv * priv,struct rte_mbuf * mbuf)128 mana_alloc_pmd_mr(struct mana_mr_btree *local_mr_btree, struct mana_priv *priv,
129 struct rte_mbuf *mbuf)
130 {
131 struct rte_mempool *pool = mbuf->pool;
132 int ret, second_try = 0;
133 struct mana_mr_cache *mr;
134 uint16_t idx;
135
136 DP_LOG(DEBUG, "finding mr for mbuf addr %p len %d",
137 mbuf->buf_addr, mbuf->buf_len);
138
139 try_again:
140 /* First try to find the MR in local queue tree */
141 ret = mana_mr_btree_lookup(local_mr_btree, &idx,
142 (uintptr_t)mbuf->buf_addr, mbuf->buf_len,
143 &mr);
144 if (ret)
145 return NULL;
146
147 if (mr) {
148 DP_LOG(DEBUG, "Local mr lkey %u addr 0x%" PRIxPTR " len %zu",
149 mr->lkey, mr->addr, mr->len);
150 return mr;
151 }
152
153 /* If not found, try to find the MR in global tree */
154 rte_spinlock_lock(&priv->mr_btree_lock);
155 ret = mana_mr_btree_lookup(&priv->mr_btree, &idx,
156 (uintptr_t)mbuf->buf_addr,
157 mbuf->buf_len, &mr);
158 rte_spinlock_unlock(&priv->mr_btree_lock);
159
160 if (ret)
161 return NULL;
162
163 /* If found in the global tree, add it to the local tree */
164 if (mr) {
165 ret = mana_mr_btree_insert(local_mr_btree, mr);
166 if (ret) {
167 DP_LOG(ERR, "Failed to add MR to local tree.");
168 return NULL;
169 }
170
171 DP_LOG(DEBUG,
172 "Added local MR key %u addr 0x%" PRIxPTR " len %zu",
173 mr->lkey, mr->addr, mr->len);
174 return mr;
175 }
176
177 if (second_try) {
178 DP_LOG(ERR, "Internal error second try failed");
179 return NULL;
180 }
181
182 ret = mana_new_pmd_mr(local_mr_btree, priv, pool);
183 if (ret) {
184 DP_LOG(ERR, "Failed to allocate MR ret %d addr %p len %d",
185 ret, mbuf->buf_addr, mbuf->buf_len);
186 return NULL;
187 }
188
189 second_try = 1;
190 goto try_again;
191 }
192
193 void
mana_remove_all_mr(struct mana_priv * priv)194 mana_remove_all_mr(struct mana_priv *priv)
195 {
196 struct mana_mr_btree *bt = &priv->mr_btree;
197 struct mana_mr_cache *mr;
198 struct ibv_mr *ibv_mr;
199 uint16_t i;
200
201 rte_spinlock_lock(&priv->mr_btree_lock);
202 /* Start with index 1 as the 1st entry is always NULL */
203 for (i = 1; i < bt->len; i++) {
204 mr = &bt->table[i];
205 ibv_mr = mr->verb_obj;
206 ibv_dereg_mr(ibv_mr);
207 }
208 bt->len = 1;
209 rte_spinlock_unlock(&priv->mr_btree_lock);
210 }
211
212 /*
213 * Expand the MR cache.
214 * MR cache is maintained as a btree and expand on demand.
215 */
216 static int
mana_mr_btree_expand(struct mana_mr_btree * bt,int n)217 mana_mr_btree_expand(struct mana_mr_btree *bt, int n)
218 {
219 void *mem;
220
221 mem = rte_realloc_socket(bt->table, n * sizeof(struct mana_mr_cache),
222 0, bt->socket);
223 if (!mem) {
224 DP_LOG(ERR, "Failed to expand btree size %d", n);
225 return -1;
226 }
227
228 DP_LOG(ERR, "Expanded btree to size %d", n);
229 bt->table = mem;
230 bt->size = n;
231
232 return 0;
233 }
234
235 /*
236 * Look for a region of memory in MR cache.
237 */
mana_mr_btree_lookup(struct mana_mr_btree * bt,uint16_t * idx,uintptr_t addr,size_t len,struct mana_mr_cache ** cache)238 int mana_mr_btree_lookup(struct mana_mr_btree *bt, uint16_t *idx,
239 uintptr_t addr, size_t len,
240 struct mana_mr_cache **cache)
241 {
242 struct mana_mr_cache *table;
243 uint16_t n;
244 uint16_t base = 0;
245 int ret;
246
247 *cache = NULL;
248
249 n = bt->len;
250 /* Try to double the cache if it's full */
251 if (n == bt->size) {
252 ret = mana_mr_btree_expand(bt, bt->size << 1);
253 if (ret)
254 return ret;
255 }
256
257 table = bt->table;
258
259 /* Do binary search on addr */
260 do {
261 uint16_t delta = n >> 1;
262
263 if (addr < table[base + delta].addr) {
264 n = delta;
265 } else {
266 base += delta;
267 n -= delta;
268 }
269 } while (n > 1);
270
271 *idx = base;
272
273 if (addr + len <= table[base].addr + table[base].len) {
274 *cache = &table[base];
275 return 0;
276 }
277
278 DP_LOG(DEBUG,
279 "addr 0x%" PRIxPTR " len %zu idx %u sum 0x%" PRIxPTR " not found",
280 addr, len, *idx, addr + len);
281
282 return 0;
283 }
284
285 int
mana_mr_btree_init(struct mana_mr_btree * bt,int n,int socket)286 mana_mr_btree_init(struct mana_mr_btree *bt, int n, int socket)
287 {
288 memset(bt, 0, sizeof(*bt));
289 bt->table = rte_calloc_socket("MANA B-tree table",
290 n,
291 sizeof(struct mana_mr_cache),
292 0, socket);
293 if (!bt->table) {
294 DRV_LOG(ERR, "Failed to allocate B-tree n %d socket %d",
295 n, socket);
296 return -ENOMEM;
297 }
298
299 bt->socket = socket;
300 bt->size = n;
301
302 /* First entry must be NULL for binary search to work */
303 bt->table[0] = (struct mana_mr_cache) {
304 .lkey = UINT32_MAX,
305 };
306 bt->len = 1;
307
308 DRV_LOG(ERR, "B-tree initialized table %p size %d len %d",
309 bt->table, n, bt->len);
310
311 return 0;
312 }
313
314 void
mana_mr_btree_free(struct mana_mr_btree * bt)315 mana_mr_btree_free(struct mana_mr_btree *bt)
316 {
317 rte_free(bt->table);
318 memset(bt, 0, sizeof(*bt));
319 }
320
321 int
mana_mr_btree_insert(struct mana_mr_btree * bt,struct mana_mr_cache * entry)322 mana_mr_btree_insert(struct mana_mr_btree *bt, struct mana_mr_cache *entry)
323 {
324 struct mana_mr_cache *table;
325 uint16_t idx = 0;
326 uint16_t shift;
327 int ret;
328
329 ret = mana_mr_btree_lookup(bt, &idx, entry->addr, entry->len, &table);
330 if (ret)
331 return ret;
332
333 if (table) {
334 DP_LOG(DEBUG, "Addr 0x%" PRIxPTR " len %zu exists in btree",
335 entry->addr, entry->len);
336 return 0;
337 }
338
339 if (bt->len >= bt->size) {
340 DP_LOG(ERR, "Btree overflow detected len %u size %u",
341 bt->len, bt->size);
342 bt->overflow = 1;
343 return -1;
344 }
345
346 table = bt->table;
347
348 idx++;
349 shift = (bt->len - idx) * sizeof(struct mana_mr_cache);
350 if (shift) {
351 DP_LOG(DEBUG, "Moving %u bytes from idx %u to %u",
352 shift, idx, idx + 1);
353 memmove(&table[idx + 1], &table[idx], shift);
354 }
355
356 table[idx] = *entry;
357 bt->len++;
358
359 DP_LOG(DEBUG,
360 "Inserted MR b-tree table %p idx %d addr 0x%" PRIxPTR " len %zu",
361 table, idx, entry->addr, entry->len);
362
363 return 0;
364 }
365