1 /* SPDX-License-Identifier: BSD-3-Clause 2 * Copyright 2022 Microsoft Corporation 3 */ 4 5 #include <rte_malloc.h> 6 #include <ethdev_driver.h> 7 #include <rte_eal_paging.h> 8 9 #include <infiniband/verbs.h> 10 11 #include "mana.h" 12 13 struct mana_range { 14 uintptr_t start; 15 uintptr_t end; 16 uint32_t len; 17 }; 18 19 void 20 mana_mempool_chunk_cb(struct rte_mempool *mp __rte_unused, void *opaque, 21 struct rte_mempool_memhdr *memhdr, unsigned int idx) 22 { 23 struct mana_range *ranges = opaque; 24 struct mana_range *range = &ranges[idx]; 25 uint64_t page_size = rte_mem_page_size(); 26 27 range->start = RTE_ALIGN_FLOOR((uintptr_t)memhdr->addr, page_size); 28 range->end = RTE_ALIGN_CEIL((uintptr_t)memhdr->addr + memhdr->len, 29 page_size); 30 range->len = range->end - range->start; 31 } 32 33 /* 34 * Register all memory regions from pool. 35 */ 36 int 37 mana_new_pmd_mr(struct mana_mr_btree *local_tree, struct mana_priv *priv, 38 struct rte_mempool *pool) 39 { 40 struct ibv_mr *ibv_mr; 41 struct mana_range ranges[pool->nb_mem_chunks]; 42 uint32_t i; 43 struct mana_mr_cache mr; 44 int ret; 45 46 rte_mempool_mem_iter(pool, mana_mempool_chunk_cb, ranges); 47 48 for (i = 0; i < pool->nb_mem_chunks; i++) { 49 if (ranges[i].len > priv->max_mr_size) { 50 DP_LOG(ERR, "memory chunk size %u exceeding max MR", 51 ranges[i].len); 52 return -ENOMEM; 53 } 54 55 DP_LOG(DEBUG, 56 "registering memory chunk start 0x%" PRIxPTR " len %u", 57 ranges[i].start, ranges[i].len); 58 59 if (rte_eal_process_type() == RTE_PROC_SECONDARY) { 60 /* Send a message to the primary to do MR */ 61 ret = mana_mp_req_mr_create(priv, ranges[i].start, 62 ranges[i].len); 63 if (ret) { 64 DP_LOG(ERR, 65 "MR failed start 0x%" PRIxPTR " len %u", 66 ranges[i].start, ranges[i].len); 67 return ret; 68 } 69 continue; 70 } 71 72 ibv_mr = ibv_reg_mr(priv->ib_pd, (void *)ranges[i].start, 73 ranges[i].len, IBV_ACCESS_LOCAL_WRITE); 74 if (ibv_mr) { 75 DP_LOG(DEBUG, "MR lkey %u addr %p len %zu", 76 ibv_mr->lkey, ibv_mr->addr, ibv_mr->length); 77 78 mr.lkey = ibv_mr->lkey; 79 mr.addr = (uintptr_t)ibv_mr->addr; 80 mr.len = ibv_mr->length; 81 mr.verb_obj = ibv_mr; 82 83 rte_spinlock_lock(&priv->mr_btree_lock); 84 ret = mana_mr_btree_insert(&priv->mr_btree, &mr); 85 rte_spinlock_unlock(&priv->mr_btree_lock); 86 if (ret) { 87 ibv_dereg_mr(ibv_mr); 88 DP_LOG(ERR, "Failed to add to global MR btree"); 89 return ret; 90 } 91 92 ret = mana_mr_btree_insert(local_tree, &mr); 93 if (ret) { 94 /* Don't need to clean up MR as it's already 95 * in the global tree 96 */ 97 DP_LOG(ERR, "Failed to add to local MR btree"); 98 return ret; 99 } 100 } else { 101 DP_LOG(ERR, "MR failed at 0x%" PRIxPTR " len %u", 102 ranges[i].start, ranges[i].len); 103 return -errno; 104 } 105 } 106 return 0; 107 } 108 109 /* 110 * Deregister a MR. 111 */ 112 void 113 mana_del_pmd_mr(struct mana_mr_cache *mr) 114 { 115 int ret; 116 struct ibv_mr *ibv_mr = (struct ibv_mr *)mr->verb_obj; 117 118 ret = ibv_dereg_mr(ibv_mr); 119 if (ret) 120 DP_LOG(ERR, "dereg MR failed ret %d", ret); 121 } 122 123 /* 124 * Alloc a MR. 125 * Try to find a MR in the cache. If not found, register a new MR. 126 */ 127 struct mana_mr_cache * 128 mana_alloc_pmd_mr(struct mana_mr_btree *local_mr_btree, struct mana_priv *priv, 129 struct rte_mbuf *mbuf) 130 { 131 struct rte_mempool *pool = mbuf->pool; 132 int ret, second_try = 0; 133 struct mana_mr_cache *mr; 134 uint16_t idx; 135 136 DP_LOG(DEBUG, "finding mr for mbuf addr %p len %d", 137 mbuf->buf_addr, mbuf->buf_len); 138 139 try_again: 140 /* First try to find the MR in local queue tree */ 141 ret = mana_mr_btree_lookup(local_mr_btree, &idx, 142 (uintptr_t)mbuf->buf_addr, mbuf->buf_len, 143 &mr); 144 if (ret) 145 return NULL; 146 147 if (mr) { 148 DP_LOG(DEBUG, "Local mr lkey %u addr 0x%" PRIxPTR " len %zu", 149 mr->lkey, mr->addr, mr->len); 150 return mr; 151 } 152 153 /* If not found, try to find the MR in global tree */ 154 rte_spinlock_lock(&priv->mr_btree_lock); 155 ret = mana_mr_btree_lookup(&priv->mr_btree, &idx, 156 (uintptr_t)mbuf->buf_addr, 157 mbuf->buf_len, &mr); 158 rte_spinlock_unlock(&priv->mr_btree_lock); 159 160 if (ret) 161 return NULL; 162 163 /* If found in the global tree, add it to the local tree */ 164 if (mr) { 165 ret = mana_mr_btree_insert(local_mr_btree, mr); 166 if (ret) { 167 DP_LOG(ERR, "Failed to add MR to local tree."); 168 return NULL; 169 } 170 171 DP_LOG(DEBUG, 172 "Added local MR key %u addr 0x%" PRIxPTR " len %zu", 173 mr->lkey, mr->addr, mr->len); 174 return mr; 175 } 176 177 if (second_try) { 178 DP_LOG(ERR, "Internal error second try failed"); 179 return NULL; 180 } 181 182 ret = mana_new_pmd_mr(local_mr_btree, priv, pool); 183 if (ret) { 184 DP_LOG(ERR, "Failed to allocate MR ret %d addr %p len %d", 185 ret, mbuf->buf_addr, mbuf->buf_len); 186 return NULL; 187 } 188 189 second_try = 1; 190 goto try_again; 191 } 192 193 void 194 mana_remove_all_mr(struct mana_priv *priv) 195 { 196 struct mana_mr_btree *bt = &priv->mr_btree; 197 struct mana_mr_cache *mr; 198 struct ibv_mr *ibv_mr; 199 uint16_t i; 200 201 rte_spinlock_lock(&priv->mr_btree_lock); 202 /* Start with index 1 as the 1st entry is always NULL */ 203 for (i = 1; i < bt->len; i++) { 204 mr = &bt->table[i]; 205 ibv_mr = mr->verb_obj; 206 ibv_dereg_mr(ibv_mr); 207 } 208 bt->len = 1; 209 rte_spinlock_unlock(&priv->mr_btree_lock); 210 } 211 212 /* 213 * Expand the MR cache. 214 * MR cache is maintained as a btree and expand on demand. 215 */ 216 static int 217 mana_mr_btree_expand(struct mana_mr_btree *bt, int n) 218 { 219 void *mem; 220 221 mem = rte_realloc_socket(bt->table, n * sizeof(struct mana_mr_cache), 222 0, bt->socket); 223 if (!mem) { 224 DP_LOG(ERR, "Failed to expand btree size %d", n); 225 return -1; 226 } 227 228 DP_LOG(ERR, "Expanded btree to size %d", n); 229 bt->table = mem; 230 bt->size = n; 231 232 return 0; 233 } 234 235 /* 236 * Look for a region of memory in MR cache. 237 */ 238 int mana_mr_btree_lookup(struct mana_mr_btree *bt, uint16_t *idx, 239 uintptr_t addr, size_t len, 240 struct mana_mr_cache **cache) 241 { 242 struct mana_mr_cache *table; 243 uint16_t n; 244 uint16_t base = 0; 245 int ret; 246 247 *cache = NULL; 248 249 n = bt->len; 250 /* Try to double the cache if it's full */ 251 if (n == bt->size) { 252 ret = mana_mr_btree_expand(bt, bt->size << 1); 253 if (ret) 254 return ret; 255 } 256 257 table = bt->table; 258 259 /* Do binary search on addr */ 260 do { 261 uint16_t delta = n >> 1; 262 263 if (addr < table[base + delta].addr) { 264 n = delta; 265 } else { 266 base += delta; 267 n -= delta; 268 } 269 } while (n > 1); 270 271 *idx = base; 272 273 if (addr + len <= table[base].addr + table[base].len) { 274 *cache = &table[base]; 275 return 0; 276 } 277 278 DP_LOG(DEBUG, 279 "addr 0x%" PRIxPTR " len %zu idx %u sum 0x%" PRIxPTR " not found", 280 addr, len, *idx, addr + len); 281 282 return 0; 283 } 284 285 int 286 mana_mr_btree_init(struct mana_mr_btree *bt, int n, int socket) 287 { 288 memset(bt, 0, sizeof(*bt)); 289 bt->table = rte_calloc_socket("MANA B-tree table", 290 n, 291 sizeof(struct mana_mr_cache), 292 0, socket); 293 if (!bt->table) { 294 DRV_LOG(ERR, "Failed to allocate B-tree n %d socket %d", 295 n, socket); 296 return -ENOMEM; 297 } 298 299 bt->socket = socket; 300 bt->size = n; 301 302 /* First entry must be NULL for binary search to work */ 303 bt->table[0] = (struct mana_mr_cache) { 304 .lkey = UINT32_MAX, 305 }; 306 bt->len = 1; 307 308 DRV_LOG(ERR, "B-tree initialized table %p size %d len %d", 309 bt->table, n, bt->len); 310 311 return 0; 312 } 313 314 void 315 mana_mr_btree_free(struct mana_mr_btree *bt) 316 { 317 rte_free(bt->table); 318 memset(bt, 0, sizeof(*bt)); 319 } 320 321 int 322 mana_mr_btree_insert(struct mana_mr_btree *bt, struct mana_mr_cache *entry) 323 { 324 struct mana_mr_cache *table; 325 uint16_t idx = 0; 326 uint16_t shift; 327 int ret; 328 329 ret = mana_mr_btree_lookup(bt, &idx, entry->addr, entry->len, &table); 330 if (ret) 331 return ret; 332 333 if (table) { 334 DP_LOG(DEBUG, "Addr 0x%" PRIxPTR " len %zu exists in btree", 335 entry->addr, entry->len); 336 return 0; 337 } 338 339 if (bt->len >= bt->size) { 340 DP_LOG(ERR, "Btree overflow detected len %u size %u", 341 bt->len, bt->size); 342 bt->overflow = 1; 343 return -1; 344 } 345 346 table = bt->table; 347 348 idx++; 349 shift = (bt->len - idx) * sizeof(struct mana_mr_cache); 350 if (shift) { 351 DP_LOG(DEBUG, "Moving %u bytes from idx %u to %u", 352 shift, idx, idx + 1); 353 memmove(&table[idx + 1], &table[idx], shift); 354 } 355 356 table[idx] = *entry; 357 bt->len++; 358 359 DP_LOG(DEBUG, 360 "Inserted MR b-tree table %p idx %d addr 0x%" PRIxPTR " len %zu", 361 table, idx, entry->addr, entry->len); 362 363 return 0; 364 } 365