/*-
 *   BSD LICENSE
 *
 *   Copyright (c) Intel Corporation.
 *   All rights reserved.
 *
 *   Redistribution and use in source and binary forms, with or without
 *   modification, are permitted provided that the following conditions
 *   are met:
 *
 *     * Redistributions of source code must retain the above copyright
 *       notice, this list of conditions and the following disclaimer.
 *     * Redistributions in binary form must reproduce the above copyright
 *       notice, this list of conditions and the following disclaimer in
 *       the documentation and/or other materials provided with the
 *       distribution.
 *     * Neither the name of Intel Corporation nor the names of its
 *       contributors may be used to endorse or promote products derived
 *       from this software without specific prior written permission.
 *
 *   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 *   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 *   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
 *   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
 *   OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
 *   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
 *   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
 *   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
 *   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 *   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 *   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */

#include "spdk/stdinc.h"

#include "env_internal.h"

#include <rte_config.h>
#include <rte_eal_memconfig.h>

#include "spdk_internal/assert.h"

#include "spdk/assert.h"
#include "spdk/likely.h"
#include "spdk/queue.h"
#include "spdk/util.h"

#if DEBUG
#define DEBUG_PRINT(...) fprintf(stderr, __VA_ARGS__)
#else
#define DEBUG_PRINT(...)
#endif

#define FN_2MB_TO_4KB(fn)	(fn << (SHIFT_2MB - SHIFT_4KB))
#define FN_4KB_TO_2MB(fn)	(fn >> (SHIFT_2MB - SHIFT_4KB))

#define MAP_256TB_IDX(vfn_2mb)	((vfn_2mb) >> (SHIFT_1GB - SHIFT_2MB))
#define MAP_1GB_IDX(vfn_2mb)	((vfn_2mb) & ((1ULL << (SHIFT_1GB - SHIFT_2MB)) - 1))

/* Translation of a single 2MB page. */
struct map_2mb {
	uint64_t translation_2mb;
};

/* Second-level map table indexed by bits [21..29] of the virtual address.
 * Each entry contains the address translation, or the map's default
 * translation for entries that have not been set yet.
 */
struct map_1gb {
	struct map_2mb map[1ULL << (SHIFT_1GB - SHIFT_2MB)];
};
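/*
 * Worked example of the index math above, assuming the usual SPDK shift
 * constants from env_internal.h (SHIFT_2MB = 21, SHIFT_1GB = 30,
 * SHIFT_256TB = 47):
 *
 *	vaddr     = 0x7f0040200000
 *	vfn_2mb   = vaddr >> SHIFT_2MB     = 0x3f80201
 *	idx_256tb = MAP_256TB_IDX(vfn_2mb) = vfn_2mb >> 9    = 0x1fc01
 *	idx_1gb   = MAP_1GB_IDX(vfn_2mb)   = vfn_2mb & 0x1ff = 0x1
 *
 * so the translation for that 2MB page lives at
 * map_256tb.map[0x1fc01]->map[0x1].translation_2mb.
 */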
/* Top-level map table indexed by bits [30..46] of the virtual address.
 * Each entry points to a second-level map table or NULL.
 */
struct map_256tb {
	struct map_1gb *map[1ULL << (SHIFT_256TB - SHIFT_1GB)];
};

/* Page-granularity memory address translation */
struct spdk_mem_map {
	struct map_256tb map_256tb;
	pthread_mutex_t mutex;
	uint64_t default_translation;
	spdk_mem_map_notify_cb notify_cb;
	void *cb_ctx;
	TAILQ_ENTRY(spdk_mem_map) tailq;
};

static struct spdk_mem_map *g_mem_reg_map;
static TAILQ_HEAD(, spdk_mem_map) g_spdk_mem_maps = TAILQ_HEAD_INITIALIZER(g_spdk_mem_maps);
static pthread_mutex_t g_spdk_mem_map_mutex = PTHREAD_MUTEX_INITIALIZER;

/*
 * Walk the currently registered memory via the main memory registration map
 * and call the new map's notify callback for each virtually contiguous region.
 */
static void
spdk_mem_map_notify_walk(struct spdk_mem_map *map, enum spdk_mem_map_notify_action action)
{
	size_t idx_256tb;
	uint64_t contig_start = 0;
	uint64_t contig_end = 0;

#define END_RANGE()	\
	do {	\
		if (contig_start != 0) {	\
			/* End of a virtually contiguous range */	\
			map->notify_cb(map->cb_ctx, map, action,	\
				       (void *)contig_start,	\
				       contig_end - contig_start + VALUE_2MB);	\
		}	\
		contig_start = 0;	\
	} while (0)

	if (!g_mem_reg_map) {
		return;
	}

	/* Hold the memory registration map mutex so no new registrations can be added while we are looping. */
	pthread_mutex_lock(&g_mem_reg_map->mutex);

	for (idx_256tb = 0; idx_256tb < SPDK_COUNTOF(g_mem_reg_map->map_256tb.map); idx_256tb++) {
		const struct map_1gb *map_1gb = g_mem_reg_map->map_256tb.map[idx_256tb];
		uint64_t idx_1gb;

		if (!map_1gb) {
			END_RANGE();
			continue;
		}

		for (idx_1gb = 0; idx_1gb < SPDK_COUNTOF(map_1gb->map); idx_1gb++) {
			if (map_1gb->map[idx_1gb].translation_2mb != 0) {
				/* Rebuild the virtual address from the indexes */
				uint64_t vaddr = (idx_256tb << SHIFT_1GB) | (idx_1gb << SHIFT_2MB);

				if (contig_start == 0) {
					contig_start = vaddr;
				}
				contig_end = vaddr;
			} else {
				END_RANGE();
			}
		}
	}

	pthread_mutex_unlock(&g_mem_reg_map->mutex);
}

struct spdk_mem_map *
spdk_mem_map_alloc(uint64_t default_translation, spdk_mem_map_notify_cb notify_cb, void *cb_ctx)
{
	struct spdk_mem_map *map;

	map = calloc(1, sizeof(*map));
	if (map == NULL) {
		return NULL;
	}

	if (pthread_mutex_init(&map->mutex, NULL)) {
		free(map);
		return NULL;
	}

	map->default_translation = default_translation;
	map->notify_cb = notify_cb;
	map->cb_ctx = cb_ctx;

	pthread_mutex_lock(&g_spdk_mem_map_mutex);

	if (notify_cb) {
		spdk_mem_map_notify_walk(map, SPDK_MEM_MAP_NOTIFY_REGISTER);
		TAILQ_INSERT_TAIL(&g_spdk_mem_maps, map, tailq);
	}

	pthread_mutex_unlock(&g_spdk_mem_map_mutex);

	return map;
}

void
spdk_mem_map_free(struct spdk_mem_map **pmap)
{
	struct spdk_mem_map *map;
	size_t i;

	if (!pmap) {
		return;
	}

	map = *pmap;

	if (!map) {
		return;
	}

	pthread_mutex_lock(&g_spdk_mem_map_mutex);
	spdk_mem_map_notify_walk(map, SPDK_MEM_MAP_NOTIFY_UNREGISTER);
	TAILQ_REMOVE(&g_spdk_mem_maps, map, tailq);
	pthread_mutex_unlock(&g_spdk_mem_map_mutex);

	for (i = 0; i < SPDK_COUNTOF(map->map_256tb.map); i++) {
		free(map->map_256tb.map[i]);
	}

	pthread_mutex_destroy(&map->mutex);
	free(map);
	*pmap = NULL;
}
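/*
 * Sketch of typical usage (hypothetical callback; the signature matches the
 * call sites in this file): mirror registrations into a device, then tear
 * the map down.
 *
 *	static int
 *	my_notify(void *cb_ctx, struct spdk_mem_map *map,
 *		  enum spdk_mem_map_notify_action action,
 *		  void *vaddr, size_t size)
 *	{
 *		(pin or unpin [vaddr, vaddr + size) with the device here)
 *		return 0;
 *	}
 *
 *	struct spdk_mem_map *map = spdk_mem_map_alloc(0, my_notify, NULL);
 *	...
 *	spdk_mem_map_free(&map);
 *
 * The callback fires once per virtually contiguous registered region when the
 * map is allocated, and again with SPDK_MEM_MAP_NOTIFY_UNREGISTER on free.
 */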
int
spdk_mem_register(void *vaddr, size_t len)
{
	struct spdk_mem_map *map;
	int rc;
	void *seg_vaddr;
	size_t seg_len;

	if ((uintptr_t)vaddr & ~MASK_256TB) {
		DEBUG_PRINT("invalid usermode virtual address %p\n", vaddr);
		return -EINVAL;
	}

	if (((uintptr_t)vaddr & MASK_2MB) || (len & MASK_2MB)) {
		DEBUG_PRINT("invalid %s parameters, vaddr=%p len=%ju\n",
			    __func__, vaddr, len);
		return -EINVAL;
	}

	pthread_mutex_lock(&g_spdk_mem_map_mutex);

	seg_vaddr = vaddr;
	seg_len = 0;
	while (len > 0) {
		uint64_t ref_count;

		/* In g_mem_reg_map, the "translation" is the reference count */
		ref_count = spdk_mem_map_translate(g_mem_reg_map, (uint64_t)vaddr);
		spdk_mem_map_set_translation(g_mem_reg_map, (uint64_t)vaddr, VALUE_2MB, ref_count + 1);

		if (ref_count > 0) {
			/* This page was already registered; flush the contiguous
			 * run of newly registered pages accumulated before it. */
			if (seg_len > 0) {
				TAILQ_FOREACH(map, &g_spdk_mem_maps, tailq) {
					rc = map->notify_cb(map->cb_ctx, map, SPDK_MEM_MAP_NOTIFY_REGISTER, seg_vaddr, seg_len);
					if (rc != 0) {
						pthread_mutex_unlock(&g_spdk_mem_map_mutex);
						return rc;
					}
				}
			}

			seg_vaddr = vaddr + VALUE_2MB;
			seg_len = 0;
		} else {
			seg_len += VALUE_2MB;
		}

		vaddr += VALUE_2MB;
		len -= VALUE_2MB;
	}

	if (seg_len > 0) {
		TAILQ_FOREACH(map, &g_spdk_mem_maps, tailq) {
			rc = map->notify_cb(map->cb_ctx, map, SPDK_MEM_MAP_NOTIFY_REGISTER, seg_vaddr, seg_len);
			if (rc != 0) {
				pthread_mutex_unlock(&g_spdk_mem_map_mutex);
				return rc;
			}
		}
	}

	pthread_mutex_unlock(&g_spdk_mem_map_mutex);
	return 0;
}

int
spdk_mem_unregister(void *vaddr, size_t len)
{
	struct spdk_mem_map *map;
	int rc;
	void *seg_vaddr;
	size_t seg_len;
	uint64_t ref_count;

	if ((uintptr_t)vaddr & ~MASK_256TB) {
		DEBUG_PRINT("invalid usermode virtual address %p\n", vaddr);
		return -EINVAL;
	}

	if (((uintptr_t)vaddr & MASK_2MB) || (len & MASK_2MB)) {
		DEBUG_PRINT("invalid %s parameters, vaddr=%p len=%ju\n",
			    __func__, vaddr, len);
		return -EINVAL;
	}

	pthread_mutex_lock(&g_spdk_mem_map_mutex);

	/* First pass: fail if any page in the range is not currently registered. */
	seg_vaddr = vaddr;
	seg_len = len;
	while (seg_len > 0) {
		ref_count = spdk_mem_map_translate(g_mem_reg_map, (uint64_t)seg_vaddr);
		if (ref_count == 0) {
			pthread_mutex_unlock(&g_spdk_mem_map_mutex);
			return -EINVAL;
		}
		seg_vaddr += VALUE_2MB;
		seg_len -= VALUE_2MB;
	}

	seg_vaddr = vaddr;
	seg_len = 0;
	while (len > 0) {
		/* In g_mem_reg_map, the "translation" is the reference count */
		ref_count = spdk_mem_map_translate(g_mem_reg_map, (uint64_t)vaddr);
		spdk_mem_map_set_translation(g_mem_reg_map, (uint64_t)vaddr, VALUE_2MB, ref_count - 1);

		if (ref_count > 1) {
			/* This page is still referenced; flush the contiguous run
			 * of pages whose last reference was just dropped. */
			if (seg_len > 0) {
				TAILQ_FOREACH(map, &g_spdk_mem_maps, tailq) {
					rc = map->notify_cb(map->cb_ctx, map, SPDK_MEM_MAP_NOTIFY_UNREGISTER, seg_vaddr, seg_len);
					if (rc != 0) {
						pthread_mutex_unlock(&g_spdk_mem_map_mutex);
						return rc;
					}
				}
			}

			seg_vaddr = vaddr + VALUE_2MB;
			seg_len = 0;
		} else {
			seg_len += VALUE_2MB;
		}

		vaddr += VALUE_2MB;
		len -= VALUE_2MB;
	}

	if (seg_len > 0) {
		TAILQ_FOREACH(map, &g_spdk_mem_maps, tailq) {
			rc = map->notify_cb(map->cb_ctx, map, SPDK_MEM_MAP_NOTIFY_UNREGISTER, seg_vaddr, seg_len);
			if (rc != 0) {
				pthread_mutex_unlock(&g_spdk_mem_map_mutex);
				return rc;
			}
		}
	}

	pthread_mutex_unlock(&g_spdk_mem_map_mutex);
	return 0;
}
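/*
 * Reference-count example for the two functions above: registering the same
 * 2MB page twice produces exactly one REGISTER and one UNREGISTER
 * notification (buf is a hypothetical 2MB-aligned allocation).
 *
 *	spdk_mem_register(buf, VALUE_2MB);   (ref 0 -> 1, maps notified: REGISTER)
 *	spdk_mem_register(buf, VALUE_2MB);   (ref 1 -> 2, no notification)
 *	spdk_mem_unregister(buf, VALUE_2MB); (ref 2 -> 1, no notification)
 *	spdk_mem_unregister(buf, VALUE_2MB); (ref 1 -> 0, maps notified: UNREGISTER)
 */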
static struct map_1gb *
spdk_mem_map_get_map_1gb(struct spdk_mem_map *map, uint64_t vfn_2mb)
{
	struct map_1gb *map_1gb;
	uint64_t idx_256tb = MAP_256TB_IDX(vfn_2mb);
	size_t i;

	map_1gb = map->map_256tb.map[idx_256tb];

	if (!map_1gb) {
		pthread_mutex_lock(&map->mutex);

		/* Recheck to make sure nobody else got the mutex first. */
		map_1gb = map->map_256tb.map[idx_256tb];
		if (!map_1gb) {
			map_1gb = malloc(sizeof(struct map_1gb));
			if (map_1gb) {
				/* initialize all entries to default translation */
				for (i = 0; i < SPDK_COUNTOF(map_1gb->map); i++) {
					map_1gb->map[i].translation_2mb = map->default_translation;
				}
				map->map_256tb.map[idx_256tb] = map_1gb;
			}
		}

		pthread_mutex_unlock(&map->mutex);

		if (!map_1gb) {
			DEBUG_PRINT("allocation failed\n");
			return NULL;
		}
	}

	return map_1gb;
}

int
spdk_mem_map_set_translation(struct spdk_mem_map *map, uint64_t vaddr, uint64_t size,
			     uint64_t translation)
{
	uint64_t vfn_2mb;
	struct map_1gb *map_1gb;
	uint64_t idx_1gb;
	struct map_2mb *map_2mb;

	if ((uintptr_t)vaddr & ~MASK_256TB) {
		DEBUG_PRINT("invalid usermode virtual address %lu\n", vaddr);
		return -EINVAL;
	}

	/* For now, only 2 MB-aligned registrations are supported */
	if (((uintptr_t)vaddr & MASK_2MB) || (size & MASK_2MB)) {
		DEBUG_PRINT("invalid %s parameters, vaddr=%lu len=%ju\n",
			    __func__, vaddr, size);
		return -EINVAL;
	}

	vfn_2mb = vaddr >> SHIFT_2MB;

	while (size) {
		map_1gb = spdk_mem_map_get_map_1gb(map, vfn_2mb);
		if (!map_1gb) {
			DEBUG_PRINT("could not get %p map\n", (void *)vaddr);
			return -ENOMEM;
		}

		idx_1gb = MAP_1GB_IDX(vfn_2mb);
		map_2mb = &map_1gb->map[idx_1gb];
		map_2mb->translation_2mb = translation;

		size -= VALUE_2MB;
		vfn_2mb++;
	}

	return 0;
}

int
spdk_mem_map_clear_translation(struct spdk_mem_map *map, uint64_t vaddr, uint64_t size)
{
	uint64_t vfn_2mb;
	struct map_1gb *map_1gb;
	uint64_t idx_1gb;
	struct map_2mb *map_2mb;

	if ((uintptr_t)vaddr & ~MASK_256TB) {
		DEBUG_PRINT("invalid usermode virtual address %lu\n", vaddr);
		return -EINVAL;
	}

	/* For now, only 2 MB-aligned registrations are supported */
	if (((uintptr_t)vaddr & MASK_2MB) || (size & MASK_2MB)) {
		DEBUG_PRINT("invalid %s parameters, vaddr=%lu len=%ju\n",
			    __func__, vaddr, size);
		return -EINVAL;
	}

	vfn_2mb = vaddr >> SHIFT_2MB;

	while (size) {
		map_1gb = spdk_mem_map_get_map_1gb(map, vfn_2mb);
		if (!map_1gb) {
			DEBUG_PRINT("could not get %p map\n", (void *)vaddr);
			return -ENOMEM;
		}

		idx_1gb = MAP_1GB_IDX(vfn_2mb);
		map_2mb = &map_1gb->map[idx_1gb];
		map_2mb->translation_2mb = map->default_translation;

		size -= VALUE_2MB;
		vfn_2mb++;
	}

	return 0;
}
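/*
 * Example (hypothetical): a map whose translations are physical addresses,
 * using UINT64_MAX as the "no translation" default.
 *
 *	struct spdk_mem_map *vtophys_map = spdk_mem_map_alloc(UINT64_MAX, NULL, NULL);
 *
 *	(record that the 2MB page holding vaddr translates to paddr)
 *	spdk_mem_map_set_translation(vtophys_map, vaddr, VALUE_2MB, paddr);
 *
 *	(look it up again; translations are stored per 2MB page, so the caller
 *	 re-applies the offset within the page)
 *	uint64_t translation = spdk_mem_map_translate(vtophys_map, vaddr);
 */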
uint64_t
spdk_mem_map_translate(const struct spdk_mem_map *map, uint64_t vaddr)
{
	const struct map_1gb *map_1gb;
	const struct map_2mb *map_2mb;
	uint64_t idx_256tb;
	uint64_t idx_1gb;
	uint64_t vfn_2mb;

	if (spdk_unlikely(vaddr & ~MASK_256TB)) {
		DEBUG_PRINT("invalid usermode virtual address %p\n", (void *)vaddr);
		return map->default_translation;
	}

	vfn_2mb = vaddr >> SHIFT_2MB;
	idx_256tb = MAP_256TB_IDX(vfn_2mb);
	idx_1gb = MAP_1GB_IDX(vfn_2mb);

	map_1gb = map->map_256tb.map[idx_256tb];
	if (spdk_unlikely(!map_1gb)) {
		return map->default_translation;
	}

	map_2mb = &map_1gb->map[idx_1gb];

	return map_2mb->translation_2mb;
}

int
spdk_mem_map_init(void)
{
	struct rte_mem_config *mcfg;
	size_t seg_idx;

	g_mem_reg_map = spdk_mem_map_alloc(0, NULL, NULL);
	if (g_mem_reg_map == NULL) {
		DEBUG_PRINT("memory registration map allocation failed\n");
		return -1;
	}

	/*
	 * Walk all DPDK memory segments and register them
	 * with the master memory map
	 */
	mcfg = rte_eal_get_configuration()->mem_config;

	for (seg_idx = 0; seg_idx < RTE_MAX_MEMSEG; seg_idx++) {
		struct rte_memseg *seg = &mcfg->memseg[seg_idx];

		if (seg->addr == NULL) {
			break;
		}

		spdk_mem_register(seg->addr, seg->len);
	}

	return 0;
}