1 /* SPDX-License-Identifier: BSD-3-Clause 2 * Copyright 2022 Microsoft Corporation 3 */ 4 5 #include <rte_malloc.h> 6 #include <ethdev_driver.h> 7 #include <rte_eal_paging.h> 8 9 #include <infiniband/verbs.h> 10 11 #include "mana.h" 12 13 struct mana_range { 14 uintptr_t start; 15 uintptr_t end; 16 uint32_t len; 17 }; 18 19 void 20 mana_mempool_chunk_cb(struct rte_mempool *mp __rte_unused, void *opaque, 21 struct rte_mempool_memhdr *memhdr, unsigned int idx) 22 { 23 struct mana_range *ranges = opaque; 24 struct mana_range *range = &ranges[idx]; 25 uint64_t page_size = rte_mem_page_size(); 26 27 range->start = RTE_ALIGN_FLOOR((uintptr_t)memhdr->addr, page_size); 28 range->end = RTE_ALIGN_CEIL((uintptr_t)memhdr->addr + memhdr->len, 29 page_size); 30 range->len = range->end - range->start; 31 } 32 33 /* 34 * Register all memory regions from pool. 35 */ 36 int 37 mana_new_pmd_mr(struct mana_mr_btree *local_tree, struct mana_priv *priv, 38 struct rte_mempool *pool) 39 { 40 struct ibv_mr *ibv_mr; 41 struct mana_range ranges[pool->nb_mem_chunks]; 42 uint32_t i; 43 struct mana_mr_cache *mr; 44 int ret; 45 46 rte_mempool_mem_iter(pool, mana_mempool_chunk_cb, ranges); 47 48 for (i = 0; i < pool->nb_mem_chunks; i++) { 49 if (ranges[i].len > priv->max_mr_size) { 50 DP_LOG(ERR, "memory chunk size %u exceeding max MR", 51 ranges[i].len); 52 return -ENOMEM; 53 } 54 55 DP_LOG(DEBUG, 56 "registering memory chunk start 0x%" PRIxPTR " len %u", 57 ranges[i].start, ranges[i].len); 58 59 if (rte_eal_process_type() == RTE_PROC_SECONDARY) { 60 /* Send a message to the primary to do MR */ 61 ret = mana_mp_req_mr_create(priv, ranges[i].start, 62 ranges[i].len); 63 if (ret) { 64 DP_LOG(ERR, 65 "MR failed start 0x%" PRIxPTR " len %u", 66 ranges[i].start, ranges[i].len); 67 return ret; 68 } 69 continue; 70 } 71 72 ibv_mr = ibv_reg_mr(priv->ib_pd, (void *)ranges[i].start, 73 ranges[i].len, IBV_ACCESS_LOCAL_WRITE); 74 if (ibv_mr) { 75 DP_LOG(DEBUG, "MR lkey %u addr %p len %zu", 76 ibv_mr->lkey, ibv_mr->addr, ibv_mr->length); 77 78 mr = rte_calloc("MANA MR", 1, sizeof(*mr), 0); 79 mr->lkey = ibv_mr->lkey; 80 mr->addr = (uintptr_t)ibv_mr->addr; 81 mr->len = ibv_mr->length; 82 mr->verb_obj = ibv_mr; 83 84 rte_spinlock_lock(&priv->mr_btree_lock); 85 ret = mana_mr_btree_insert(&priv->mr_btree, mr); 86 rte_spinlock_unlock(&priv->mr_btree_lock); 87 if (ret) { 88 ibv_dereg_mr(ibv_mr); 89 DP_LOG(ERR, "Failed to add to global MR btree"); 90 return ret; 91 } 92 93 ret = mana_mr_btree_insert(local_tree, mr); 94 if (ret) { 95 /* Don't need to clean up MR as it's already 96 * in the global tree 97 */ 98 DP_LOG(ERR, "Failed to add to local MR btree"); 99 return ret; 100 } 101 } else { 102 DP_LOG(ERR, "MR failed at 0x%" PRIxPTR " len %u", 103 ranges[i].start, ranges[i].len); 104 return -errno; 105 } 106 } 107 return 0; 108 } 109 110 /* 111 * Deregister a MR. 112 */ 113 void 114 mana_del_pmd_mr(struct mana_mr_cache *mr) 115 { 116 int ret; 117 struct ibv_mr *ibv_mr = (struct ibv_mr *)mr->verb_obj; 118 119 ret = ibv_dereg_mr(ibv_mr); 120 if (ret) 121 DP_LOG(ERR, "dereg MR failed ret %d", ret); 122 } 123 124 /* 125 * Alloc a MR. 126 * Try to find a MR in the cache. If not found, register a new MR. 127 */ 128 struct mana_mr_cache * 129 mana_alloc_pmd_mr(struct mana_mr_btree *local_mr_btree, struct mana_priv *priv, 130 struct rte_mbuf *mbuf) 131 { 132 struct rte_mempool *pool = mbuf->pool; 133 int ret, second_try = 0; 134 struct mana_mr_cache *mr; 135 uint16_t idx; 136 137 DP_LOG(DEBUG, "finding mr for mbuf addr %p len %d", 138 mbuf->buf_addr, mbuf->buf_len); 139 140 try_again: 141 /* First try to find the MR in local queue tree */ 142 mr = mana_mr_btree_lookup(local_mr_btree, &idx, 143 (uintptr_t)mbuf->buf_addr, mbuf->buf_len); 144 if (mr) { 145 DP_LOG(DEBUG, "Local mr lkey %u addr 0x%" PRIxPTR " len %zu", 146 mr->lkey, mr->addr, mr->len); 147 return mr; 148 } 149 150 /* If not found, try to find the MR in global tree */ 151 rte_spinlock_lock(&priv->mr_btree_lock); 152 mr = mana_mr_btree_lookup(&priv->mr_btree, &idx, 153 (uintptr_t)mbuf->buf_addr, 154 mbuf->buf_len); 155 rte_spinlock_unlock(&priv->mr_btree_lock); 156 157 /* If found in the global tree, add it to the local tree */ 158 if (mr) { 159 ret = mana_mr_btree_insert(local_mr_btree, mr); 160 if (ret) { 161 DP_LOG(ERR, "Failed to add MR to local tree."); 162 return NULL; 163 } 164 165 DP_LOG(DEBUG, 166 "Added local MR key %u addr 0x%" PRIxPTR " len %zu", 167 mr->lkey, mr->addr, mr->len); 168 return mr; 169 } 170 171 if (second_try) { 172 DP_LOG(ERR, "Internal error second try failed"); 173 return NULL; 174 } 175 176 ret = mana_new_pmd_mr(local_mr_btree, priv, pool); 177 if (ret) { 178 DP_LOG(ERR, "Failed to allocate MR ret %d addr %p len %d", 179 ret, mbuf->buf_addr, mbuf->buf_len); 180 return NULL; 181 } 182 183 second_try = 1; 184 goto try_again; 185 } 186 187 void 188 mana_remove_all_mr(struct mana_priv *priv) 189 { 190 struct mana_mr_btree *bt = &priv->mr_btree; 191 struct mana_mr_cache *mr; 192 struct ibv_mr *ibv_mr; 193 uint16_t i; 194 195 rte_spinlock_lock(&priv->mr_btree_lock); 196 /* Start with index 1 as the 1st entry is always NULL */ 197 for (i = 1; i < bt->len; i++) { 198 mr = &bt->table[i]; 199 ibv_mr = mr->verb_obj; 200 ibv_dereg_mr(ibv_mr); 201 } 202 bt->len = 1; 203 rte_spinlock_unlock(&priv->mr_btree_lock); 204 } 205 206 /* 207 * Expand the MR cache. 208 * MR cache is maintained as a btree and expand on demand. 209 */ 210 static int 211 mana_mr_btree_expand(struct mana_mr_btree *bt, int n) 212 { 213 void *mem; 214 215 mem = rte_realloc_socket(bt->table, n * sizeof(struct mana_mr_cache), 216 0, bt->socket); 217 if (!mem) { 218 DP_LOG(ERR, "Failed to expand btree size %d", n); 219 return -1; 220 } 221 222 DP_LOG(ERR, "Expanded btree to size %d", n); 223 bt->table = mem; 224 bt->size = n; 225 226 return 0; 227 } 228 229 /* 230 * Look for a region of memory in MR cache. 231 */ 232 struct mana_mr_cache * 233 mana_mr_btree_lookup(struct mana_mr_btree *bt, uint16_t *idx, 234 uintptr_t addr, size_t len) 235 { 236 struct mana_mr_cache *table; 237 uint16_t n; 238 uint16_t base = 0; 239 int ret; 240 241 n = bt->len; 242 243 /* Try to double the cache if it's full */ 244 if (n == bt->size) { 245 ret = mana_mr_btree_expand(bt, bt->size << 1); 246 if (ret) 247 return NULL; 248 } 249 250 table = bt->table; 251 252 /* Do binary search on addr */ 253 do { 254 uint16_t delta = n >> 1; 255 256 if (addr < table[base + delta].addr) { 257 n = delta; 258 } else { 259 base += delta; 260 n -= delta; 261 } 262 } while (n > 1); 263 264 *idx = base; 265 266 if (addr + len <= table[base].addr + table[base].len) 267 return &table[base]; 268 269 DP_LOG(DEBUG, 270 "addr 0x%" PRIxPTR " len %zu idx %u sum 0x%" PRIxPTR " not found", 271 addr, len, *idx, addr + len); 272 273 return NULL; 274 } 275 276 int 277 mana_mr_btree_init(struct mana_mr_btree *bt, int n, int socket) 278 { 279 memset(bt, 0, sizeof(*bt)); 280 bt->table = rte_calloc_socket("MANA B-tree table", 281 n, 282 sizeof(struct mana_mr_cache), 283 0, socket); 284 if (!bt->table) { 285 DRV_LOG(ERR, "Failed to allocate B-tree n %d socket %d", 286 n, socket); 287 return -ENOMEM; 288 } 289 290 bt->socket = socket; 291 bt->size = n; 292 293 /* First entry must be NULL for binary search to work */ 294 bt->table[0] = (struct mana_mr_cache) { 295 .lkey = UINT32_MAX, 296 }; 297 bt->len = 1; 298 299 DRV_LOG(ERR, "B-tree initialized table %p size %d len %d", 300 bt->table, n, bt->len); 301 302 return 0; 303 } 304 305 void 306 mana_mr_btree_free(struct mana_mr_btree *bt) 307 { 308 rte_free(bt->table); 309 memset(bt, 0, sizeof(*bt)); 310 } 311 312 int 313 mana_mr_btree_insert(struct mana_mr_btree *bt, struct mana_mr_cache *entry) 314 { 315 struct mana_mr_cache *table; 316 uint16_t idx = 0; 317 uint16_t shift; 318 319 if (mana_mr_btree_lookup(bt, &idx, entry->addr, entry->len)) { 320 DP_LOG(DEBUG, "Addr 0x%" PRIxPTR " len %zu exists in btree", 321 entry->addr, entry->len); 322 return 0; 323 } 324 325 if (bt->len >= bt->size) { 326 bt->overflow = 1; 327 return -1; 328 } 329 330 table = bt->table; 331 332 idx++; 333 shift = (bt->len - idx) * sizeof(struct mana_mr_cache); 334 if (shift) { 335 DP_LOG(DEBUG, "Moving %u bytes from idx %u to %u", 336 shift, idx, idx + 1); 337 memmove(&table[idx + 1], &table[idx], shift); 338 } 339 340 table[idx] = *entry; 341 bt->len++; 342 343 DP_LOG(DEBUG, 344 "Inserted MR b-tree table %p idx %d addr 0x%" PRIxPTR " len %zu", 345 table, idx, entry->addr, entry->len); 346 347 return 0; 348 } 349