/* SPDX-License-Identifier: BSD-3-Clause
 * Copyright(c) 2010-2014 Intel Corporation
 */
#include <stdint.h>
#include <stddef.h>
#include <stdlib.h>
#include <stdio.h>
#include <stdarg.h>
#include <errno.h>
#include <sys/queue.h>

#include <rte_memory.h>
#include <rte_errno.h>
#include <rte_eal.h>
#include <rte_eal_memconfig.h>
#include <rte_launch.h>
#include <rte_per_lcore.h>
#include <rte_lcore.h>
#include <rte_common.h>
#include <rte_string_fns.h>
#include <rte_spinlock.h>
#include <rte_memcpy.h>
#include <rte_memzone.h>
#include <rte_atomic.h>
#include <rte_fbarray.h>

#include "eal_internal_cfg.h"
#include "eal_memalloc.h"
#include "eal_memcfg.h"
#include "eal_private.h"
#include "malloc_elem.h"
#include "malloc_heap.h"
#include "malloc_mp.h"

/* start external socket IDs at a very high number */
#define CONST_MAX(a, b) (a > b ? a : b) /* RTE_MAX is not a constant */
#define EXTERNAL_HEAP_MIN_SOCKET_ID (CONST_MAX((1 << 8), RTE_MAX_NUMA_NODES))

static unsigned
check_hugepage_sz(unsigned flags, uint64_t hugepage_sz)
{
	unsigned check_flag = 0;

	if (!(flags & ~RTE_MEMZONE_SIZE_HINT_ONLY))
		return 1;

	switch (hugepage_sz) {
	case RTE_PGSIZE_256K:
		check_flag = RTE_MEMZONE_256KB;
		break;
	case RTE_PGSIZE_2M:
		check_flag = RTE_MEMZONE_2MB;
		break;
	case RTE_PGSIZE_16M:
		check_flag = RTE_MEMZONE_16MB;
		break;
	case RTE_PGSIZE_256M:
		check_flag = RTE_MEMZONE_256MB;
		break;
	case RTE_PGSIZE_512M:
		check_flag = RTE_MEMZONE_512MB;
		break;
	case RTE_PGSIZE_1G:
		check_flag = RTE_MEMZONE_1GB;
		break;
	case RTE_PGSIZE_4G:
		check_flag = RTE_MEMZONE_4GB;
		break;
	case RTE_PGSIZE_16G:
		check_flag = RTE_MEMZONE_16GB;
	}

	return check_flag & flags;
}

int
malloc_socket_to_heap_id(unsigned int socket_id)
{
	struct rte_mem_config *mcfg = rte_eal_get_configuration()->mem_config;
	int i;

	for (i = 0; i < RTE_MAX_HEAPS; i++) {
		struct malloc_heap *heap = &mcfg->malloc_heaps[i];

		if (heap->socket_id == socket_id)
			return i;
	}
	return -1;
}
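/*
 * Worked example for check_hugepage_sz() (illustrative, not part of the
 * original source): with flags == RTE_MEMZONE_2MB, the function returns
 * non-zero only for hugepage_sz == RTE_PGSIZE_2M; for a 1G page it
 * returns 0, so the caller must either skip that element or fall back via
 * RTE_MEMZONE_SIZE_HINT_ONLY. If no page-size flag is set at all, the
 * early return accepts any page size.
 */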
/*
 * Expand the heap with a memory area.
 */
static struct malloc_elem *
malloc_heap_add_memory(struct malloc_heap *heap, struct rte_memseg_list *msl,
		void *start, size_t len)
{
	struct malloc_elem *elem = start;

	malloc_elem_init(elem, heap, msl, len, elem, len);

	malloc_elem_insert(elem);

	elem = malloc_elem_join_adjacent_free(elem);

	malloc_elem_free_list_insert(elem);

	return elem;
}

static int
malloc_add_seg(const struct rte_memseg_list *msl,
		const struct rte_memseg *ms, size_t len, void *arg __rte_unused)
{
	struct rte_mem_config *mcfg = rte_eal_get_configuration()->mem_config;
	struct rte_memseg_list *found_msl;
	struct malloc_heap *heap;
	int msl_idx, heap_idx;

	if (msl->external)
		return 0;

	heap_idx = malloc_socket_to_heap_id(msl->socket_id);
	if (heap_idx < 0) {
		RTE_LOG(ERR, EAL, "Memseg list has invalid socket id\n");
		return -1;
	}
	heap = &mcfg->malloc_heaps[heap_idx];

	/* msl is const, so find it */
	msl_idx = msl - mcfg->memsegs;

	if (msl_idx < 0 || msl_idx >= RTE_MAX_MEMSEG_LISTS)
		return -1;

	found_msl = &mcfg->memsegs[msl_idx];

	malloc_heap_add_memory(heap, found_msl, ms->addr, len);

	heap->total_size += len;

	RTE_LOG(DEBUG, EAL, "Added %zuM to heap on socket %i\n", len >> 20,
			msl->socket_id);
	return 0;
}
/*
 * Iterates through the freelist for a heap to find a free element
 * which can store data of the required size and with the requested alignment.
 * If size is 0, find the biggest available elem.
 * Returns NULL on failure, or pointer to element on success.
 */
static struct malloc_elem *
find_suitable_element(struct malloc_heap *heap, size_t size,
		unsigned int flags, size_t align, size_t bound, bool contig)
{
	size_t idx;
	struct malloc_elem *elem, *alt_elem = NULL;

	for (idx = malloc_elem_free_list_index(size);
			idx < RTE_HEAP_NUM_FREELISTS; idx++) {
		for (elem = LIST_FIRST(&heap->free_head[idx]);
				!!elem; elem = LIST_NEXT(elem, free_list)) {
			if (malloc_elem_can_hold(elem, size, align, bound,
					contig)) {
				if (check_hugepage_sz(flags,
						elem->msl->page_sz))
					return elem;
				if (alt_elem == NULL)
					alt_elem = elem;
			}
		}
	}

	if ((alt_elem != NULL) && (flags & RTE_MEMZONE_SIZE_HINT_ONLY))
		return alt_elem;

	return NULL;
}
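/*
 * Example of the size-hint fallback above (illustrative): a request with
 * flags == (RTE_MEMZONE_2MB | RTE_MEMZONE_SIZE_HINT_ONLY) on a heap backed
 * only by 1G pages finds no element passing check_hugepage_sz(), but the
 * first element that fits is remembered as alt_elem and returned because
 * the hint flag is set. Without RTE_MEMZONE_SIZE_HINT_ONLY, the same
 * request returns NULL.
 */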
/*
 * Iterates through the freelist for a heap to find a free element with the
 * biggest size and requested alignment. Will also set *size to the size of
 * the element that was found.
 * Returns NULL on failure, or pointer to element on success.
 */
static struct malloc_elem *
find_biggest_element(struct malloc_heap *heap, size_t *size,
		unsigned int flags, size_t align, bool contig)
{
	struct malloc_elem *elem, *max_elem = NULL;
	size_t idx, max_size = 0;

	for (idx = 0; idx < RTE_HEAP_NUM_FREELISTS; idx++) {
		for (elem = LIST_FIRST(&heap->free_head[idx]);
				!!elem; elem = LIST_NEXT(elem, free_list)) {
			size_t cur_size;

			if ((flags & RTE_MEMZONE_SIZE_HINT_ONLY) == 0 &&
					!check_hugepage_sz(flags,
						elem->msl->page_sz))
				continue;
			if (contig) {
				cur_size =
					malloc_elem_find_max_iova_contig(elem,
							align);
			} else {
				void *data_start = RTE_PTR_ADD(elem,
						MALLOC_ELEM_HEADER_LEN);
				void *data_end = RTE_PTR_ADD(elem, elem->size -
						MALLOC_ELEM_TRAILER_LEN);
				void *aligned = RTE_PTR_ALIGN_CEIL(data_start,
						align);
				/* check if aligned data start is beyond end */
				if (aligned >= data_end)
					continue;
				cur_size = RTE_PTR_DIFF(data_end, aligned);
			}
			if (cur_size > max_size) {
				max_size = cur_size;
				max_elem = elem;
			}
		}
	}

	*size = max_size;
	return max_elem;
}
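/*
 * Worked example for the non-contiguous branch above (illustrative): for
 * an element with elem->size == 1 MB and align == 4096, the usable range
 * runs from the first 4K boundary at or after
 * (elem + MALLOC_ELEM_HEADER_LEN) up to
 * (elem + 1 MB - MALLOC_ELEM_TRAILER_LEN), so cur_size is 1 MB minus
 * header, trailer and alignment padding.
 */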
/*
 * Main function to allocate a block of memory from the heap.
 * It locks the free list, scans it, and adds a new memseg if the
 * scan fails. Once the new memseg is added, it re-scans and should return
 * the new element after releasing the lock.
 */
static void *
heap_alloc(struct malloc_heap *heap, const char *type __rte_unused, size_t size,
		unsigned int flags, size_t align, size_t bound, bool contig)
{
	struct malloc_elem *elem;

	size = RTE_CACHE_LINE_ROUNDUP(size);
	align = RTE_CACHE_LINE_ROUNDUP(align);

	/* roundup might cause an overflow */
	if (size == 0)
		return NULL;
	elem = find_suitable_element(heap, size, flags, align, bound, contig);
	if (elem != NULL) {
		elem = malloc_elem_alloc(elem, size, align, bound, contig);

		/* increase heap's count of allocated elements */
		heap->alloc_count++;
	}

	return elem == NULL ? NULL : (void *)(&elem[1]);
}

static void *
heap_alloc_biggest(struct malloc_heap *heap, const char *type __rte_unused,
		unsigned int flags, size_t align, bool contig)
{
	struct malloc_elem *elem;
	size_t size;

	align = RTE_CACHE_LINE_ROUNDUP(align);

	elem = find_biggest_element(heap, &size, flags, align, contig);
	if (elem != NULL) {
		elem = malloc_elem_alloc(elem, size, align, 0, contig);

		/* increase heap's count of allocated elements */
		heap->alloc_count++;
	}

	return elem == NULL ? NULL : (void *)(&elem[1]);
}
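/*
 * Note on the return expression above (illustrative): each allocation is
 * preceded in memory by its struct malloc_elem header, so &elem[1] is the
 * first byte past the header, i.e. the pointer handed back to the caller.
 * Conversely, the element header for a user pointer sits immediately
 * before it, which is how free/resize locate their metadata.
 */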
/* this function is exposed in malloc_mp.h */
void
rollback_expand_heap(struct rte_memseg **ms, int n_segs,
		struct malloc_elem *elem, void *map_addr, size_t map_len)
{
	if (elem != NULL) {
		malloc_elem_free_list_remove(elem);
		malloc_elem_hide_region(elem, map_addr, map_len);
	}

	eal_memalloc_free_seg_bulk(ms, n_segs);
}

/* this function is exposed in malloc_mp.h */
struct malloc_elem *
alloc_pages_on_heap(struct malloc_heap *heap, uint64_t pg_sz, size_t elt_size,
		int socket, unsigned int flags, size_t align, size_t bound,
		bool contig, struct rte_memseg **ms, int n_segs)
{
	struct rte_mem_config *mcfg = rte_eal_get_configuration()->mem_config;
	struct rte_memseg_list *msl;
	struct malloc_elem *elem = NULL;
	size_t alloc_sz;
	int allocd_pages;
	void *ret, *map_addr;

	alloc_sz = (size_t)pg_sz * n_segs;

	/* first, check if we're allowed to allocate this memory */
	if (eal_memalloc_mem_alloc_validate(socket,
			heap->total_size + alloc_sz) < 0) {
		RTE_LOG(DEBUG, EAL, "User has disallowed allocation\n");
		return NULL;
	}

	allocd_pages = eal_memalloc_alloc_seg_bulk(ms, n_segs, pg_sz,
			socket, true);

	/* make sure we've allocated our pages... */
	if (allocd_pages < 0)
		return NULL;

	map_addr = ms[0]->addr;
	msl = rte_mem_virt2memseg_list(map_addr);

	/* check if we wanted contiguous memory but didn't get it */
	if (contig && !eal_memalloc_is_contig(msl, map_addr, alloc_sz)) {
		RTE_LOG(DEBUG, EAL, "%s(): couldn't allocate physically contiguous space\n",
				__func__);
		goto fail;
	}
	/*
	 * Once we have all the memseg lists configured, if there is a dma mask
	 * set, check iova addresses are not out of range. Otherwise the device
	 * setting the dma mask could have problems with the mapped memory.
	 *
	 * There are two situations when this can happen:
	 * 1) memory initialization
	 * 2) dynamic memory allocation
	 *
	 * For 1), an error when checking the dma mask implies the app cannot
	 * be executed. For 2), it implies the new memory cannot be added.
	 */
	if (mcfg->dma_maskbits &&
	    rte_mem_check_dma_mask_thread_unsafe(mcfg->dma_maskbits)) {
		/*
		 * Currently this can only happen if IOMMU is enabled
		 * and the address width supported by the IOMMU hw is
		 * not enough for using the memory mapped IOVAs.
		 *
		 * If IOVA is VA, advise trying '--iova-mode pa',
		 * which could solve some situations when IOVA VA is not
		 * really needed.
		 */
		RTE_LOG(ERR, EAL,
			"%s(): couldn't allocate memory due to IOVA exceeding limits of current DMA mask\n",
			__func__);

		/*
		 * If IOVA is VA and it is possible to run with IOVA PA,
		 * because the user is root, give advice for solving the
		 * problem.
		 */
		if ((rte_eal_iova_mode() == RTE_IOVA_VA) &&
				rte_eal_using_phys_addrs())
			RTE_LOG(ERR, EAL,
				"%s(): Please try initializing EAL with --iova-mode=pa parameter\n",
				__func__);
		goto fail;
	}

	/* add newly minted memsegs to malloc heap */
	elem = malloc_heap_add_memory(heap, msl, map_addr, alloc_sz);

	/* try once more, as now we have allocated new memory */
	ret = find_suitable_element(heap, elt_size, flags, align, bound,
			contig);

	if (ret == NULL)
		goto fail;

	return elem;

fail:
	rollback_expand_heap(ms, n_segs, elem, map_addr, alloc_sz);
	return NULL;
}
static int
try_expand_heap_primary(struct malloc_heap *heap, uint64_t pg_sz,
		size_t elt_size, int socket, unsigned int flags, size_t align,
		size_t bound, bool contig)
{
	struct malloc_elem *elem;
	struct rte_memseg **ms;
	void *map_addr;
	size_t alloc_sz;
	int n_segs;
	bool callback_triggered = false;

	alloc_sz = RTE_ALIGN_CEIL(align + elt_size +
			MALLOC_ELEM_TRAILER_LEN, pg_sz);
	n_segs = alloc_sz / pg_sz;

	/* we can't know in advance how many pages we'll need, so we malloc */
	ms = malloc(sizeof(*ms) * n_segs);
	if (ms == NULL)
		return -1;
	memset(ms, 0, sizeof(*ms) * n_segs);

	elem = alloc_pages_on_heap(heap, pg_sz, elt_size, socket, flags, align,
			bound, contig, ms, n_segs);

	if (elem == NULL)
		goto free_ms;

	map_addr = ms[0]->addr;

	/* notify user about changes in memory map */
	eal_memalloc_mem_event_notify(RTE_MEM_EVENT_ALLOC, map_addr, alloc_sz);

	/* notify other processes that this has happened */
	if (request_sync()) {
		/* we couldn't ensure all processes have mapped memory,
		 * so free it back and notify everyone that it's been
		 * freed back.
		 *
		 * technically, we could've avoided adding memory addresses to
		 * the map, but that would've led to inconsistent behavior
		 * between primary and secondary processes, as those get
		 * callbacks during sync. therefore, force primary process to
		 * do alloc-and-rollback syncs as well.
		 */
		callback_triggered = true;
		goto free_elem;
	}
	heap->total_size += alloc_sz;

	RTE_LOG(DEBUG, EAL, "Heap on socket %d was expanded by %zdMB\n",
		socket, alloc_sz >> 20ULL);

	free(ms);

	return 0;

free_elem:
	if (callback_triggered)
		eal_memalloc_mem_event_notify(RTE_MEM_EVENT_FREE,
				map_addr, alloc_sz);

	rollback_expand_heap(ms, n_segs, elem, map_addr, alloc_sz);

	request_sync();
free_ms:
	free(ms);

	return -1;
}
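/*
 * Worked example for the alloc_sz computation above (illustrative): with
 * pg_sz == RTE_PGSIZE_2M, elt_size == 3 MB and align == RTE_CACHE_LINE_SIZE,
 * align + elt_size + MALLOC_ELEM_TRAILER_LEN is slightly over 3 MB, so
 * RTE_ALIGN_CEIL() rounds it up to 4 MB and n_segs == 2. The extra page
 * absorbs worst-case alignment padding plus element overhead.
 */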
static int
try_expand_heap_secondary(struct malloc_heap *heap, uint64_t pg_sz,
		size_t elt_size, int socket, unsigned int flags, size_t align,
		size_t bound, bool contig)
{
	struct rte_mem_config *mcfg = rte_eal_get_configuration()->mem_config;
	struct malloc_mp_req req;
	int req_result;

	memset(&req, 0, sizeof(req));

	req.t = REQ_TYPE_ALLOC;
	req.alloc_req.align = align;
	req.alloc_req.bound = bound;
	req.alloc_req.contig = contig;
	req.alloc_req.flags = flags;
	req.alloc_req.elt_size = elt_size;
	req.alloc_req.page_sz = pg_sz;
	req.alloc_req.socket = socket;
	req.alloc_req.malloc_heap_idx = heap - mcfg->malloc_heaps;

	req_result = request_to_primary(&req);

	if (req_result != 0)
		return -1;

	if (req.result != REQ_RESULT_SUCCESS)
		return -1;

	return 0;
}
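/*
 * Sketch of the multi-process flow above (illustrative; see malloc_mp.c
 * for the actual protocol): a secondary process does not expand the heap
 * itself. It packs the allocation parameters into a REQ_TYPE_ALLOC
 * request and blocks in request_to_primary() while the primary expands
 * the heap on its behalf and synchronizes the new mappings with every
 * process, then reports REQ_RESULT_SUCCESS or failure back in req.result.
 */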
static int
try_expand_heap(struct malloc_heap *heap, uint64_t pg_sz, size_t elt_size,
		int socket, unsigned int flags, size_t align, size_t bound,
		bool contig)
{
	int ret;

	rte_mcfg_mem_write_lock();

	if (rte_eal_process_type() == RTE_PROC_PRIMARY) {
		ret = try_expand_heap_primary(heap, pg_sz, elt_size, socket,
				flags, align, bound, contig);
	} else {
		ret = try_expand_heap_secondary(heap, pg_sz, elt_size, socket,
				flags, align, bound, contig);
	}

	rte_mcfg_mem_write_unlock();
	return ret;
}

static int
compare_pagesz(const void *a, const void *b)
{
	const struct rte_memseg_list * const *mpa = a;
	const struct rte_memseg_list * const *mpb = b;
	const struct rte_memseg_list *msla = *mpa;
	const struct rte_memseg_list *mslb = *mpb;
	uint64_t pg_sz_a = msla->page_sz;
	uint64_t pg_sz_b = mslb->page_sz;

	if (pg_sz_a < pg_sz_b)
		return -1;
	if (pg_sz_a > pg_sz_b)
		return 1;
	return 0;
}
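/*
 * Example (illustrative): used as a qsort() comparator below,
 * compare_pagesz() orders memseg lists by ascending page size, e.g.
 * {1G, 2M, 16M} becomes {2M, 16M, 1G}, so heap expansion is attempted
 * with the smallest sufficient pages first.
 */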
static int
alloc_more_mem_on_socket(struct malloc_heap *heap, size_t size, int socket,
		unsigned int flags, size_t align, size_t bound, bool contig)
{
	struct rte_mem_config *mcfg = rte_eal_get_configuration()->mem_config;
	struct rte_memseg_list *requested_msls[RTE_MAX_MEMSEG_LISTS];
	struct rte_memseg_list *other_msls[RTE_MAX_MEMSEG_LISTS];
	uint64_t requested_pg_sz[RTE_MAX_MEMSEG_LISTS];
	uint64_t other_pg_sz[RTE_MAX_MEMSEG_LISTS];
	uint64_t prev_pg_sz;
	int i, n_other_msls, n_other_pg_sz, n_requested_msls, n_requested_pg_sz;
	bool size_hint = (flags & RTE_MEMZONE_SIZE_HINT_ONLY) > 0;
	unsigned int size_flags = flags & ~RTE_MEMZONE_SIZE_HINT_ONLY;
	void *ret;

	memset(requested_msls, 0, sizeof(requested_msls));
	memset(other_msls, 0, sizeof(other_msls));
	memset(requested_pg_sz, 0, sizeof(requested_pg_sz));
	memset(other_pg_sz, 0, sizeof(other_pg_sz));

	/*
	 * go through memseg list and take note of all the page sizes available,
	 * and if any of them were specifically requested by the user.
	 */
	n_requested_msls = 0;
	n_other_msls = 0;
	for (i = 0; i < RTE_MAX_MEMSEG_LISTS; i++) {
		struct rte_memseg_list *msl = &mcfg->memsegs[i];

		if (msl->socket_id != socket)
			continue;

		if (msl->base_va == NULL)
			continue;

		/* if pages of specific size were requested */
		if (size_flags != 0 && check_hugepage_sz(size_flags,
				msl->page_sz))
			requested_msls[n_requested_msls++] = msl;
		else if (size_flags == 0 || size_hint)
			other_msls[n_other_msls++] = msl;
	}

	/* sort the lists, smallest first */
	qsort(requested_msls, n_requested_msls, sizeof(requested_msls[0]),
			compare_pagesz);
	qsort(other_msls, n_other_msls, sizeof(other_msls[0]),
			compare_pagesz);

	/* now, extract page sizes we are supposed to try */
	prev_pg_sz = 0;
	n_requested_pg_sz = 0;
	for (i = 0; i < n_requested_msls; i++) {
		uint64_t pg_sz = requested_msls[i]->page_sz;

		if (prev_pg_sz != pg_sz) {
			requested_pg_sz[n_requested_pg_sz++] = pg_sz;
			prev_pg_sz = pg_sz;
		}
	}
	prev_pg_sz = 0;
	n_other_pg_sz = 0;
	for (i = 0; i < n_other_msls; i++) {
		uint64_t pg_sz = other_msls[i]->page_sz;

		if (prev_pg_sz != pg_sz) {
			other_pg_sz[n_other_pg_sz++] = pg_sz;
			prev_pg_sz = pg_sz;
		}
	}
	/* finally, try allocating memory of specified page sizes, starting from
	 * the smallest sizes
	 */
	for (i = 0; i < n_requested_pg_sz; i++) {
		uint64_t pg_sz = requested_pg_sz[i];

		/*
		 * do not pass the size hint here, as user expects other page
		 * sizes first, before resorting to best effort allocation.
		 */
		if (!try_expand_heap(heap, pg_sz, size, socket, size_flags,
				align, bound, contig))
			return 0;
	}
	if (n_other_pg_sz == 0)
		return -1;

	/* now, check if we can reserve anything with size hint */
	ret = find_suitable_element(heap, size, flags, align, bound, contig);
	if (ret != NULL)
		return 0;

	/*
	 * we still couldn't reserve memory, so try expanding heap with other
	 * page sizes, if there are any
	 */
	for (i = 0; i < n_other_pg_sz; i++) {
		uint64_t pg_sz = other_pg_sz[i];

		if (!try_expand_heap(heap, pg_sz, size, socket, flags,
				align, bound, contig))
			return 0;
	}
	return -1;
}
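/*
 * Worked example of the two-pass strategy above (illustrative): for a
 * request with flags == (RTE_MEMZONE_2MB | RTE_MEMZONE_SIZE_HINT_ONLY) on
 * a socket that has both 2M and 1G memseg lists, the first pass tries to
 * expand the heap with 2M pages only; if that fails, the size hint allows
 * the second pass to fall back to expanding with 1G pages. Without the
 * hint flag, the function fails after the first pass because
 * n_other_pg_sz == 0.
 */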
/* this will try lower page sizes first */
static void *
malloc_heap_alloc_on_heap_id(const char *type, size_t size,
		unsigned int heap_id, unsigned int flags, size_t align,
		size_t bound, bool contig)
{
	struct rte_mem_config *mcfg = rte_eal_get_configuration()->mem_config;
	struct malloc_heap *heap = &mcfg->malloc_heaps[heap_id];
	unsigned int size_flags = flags & ~RTE_MEMZONE_SIZE_HINT_ONLY;
	int socket_id;
	void *ret;
	const struct internal_config *internal_conf =
		eal_get_internal_configuration();

	rte_spinlock_lock(&(heap->lock));

	align = align == 0 ? 1 : align;

	/* for legacy mode, try once and with all flags */
	if (internal_conf->legacy_mem) {
		ret = heap_alloc(heap, type, size, flags, align, bound, contig);
		goto alloc_unlock;
	}

	/*
	 * we do not pass the size hint here, because even if allocation fails,
	 * we may still be able to allocate memory from appropriate page sizes,
	 * we just need to request more memory first.
	 */

	socket_id = rte_socket_id_by_idx(heap_id);
	/*
	 * if socket ID is negative, we cannot find a socket ID for this heap -
	 * which means it's an external heap. those can have unexpected page
	 * sizes, so if the user asked to allocate from there, assume the user
	 * knows what they're doing, and allow allocating from there with any
	 * page size flags.
	 */
	if (socket_id < 0)
		size_flags |= RTE_MEMZONE_SIZE_HINT_ONLY;

	ret = heap_alloc(heap, type, size, size_flags, align, bound, contig);
	if (ret != NULL)
		goto alloc_unlock;

	/* if socket ID is invalid, this is an external heap */
	if (socket_id < 0)
		goto alloc_unlock;

	if (!alloc_more_mem_on_socket(heap, size, socket_id, flags, align,
			bound, contig)) {
		ret = heap_alloc(heap, type, size, flags, align, bound, contig);

		/* this should have succeeded */
		if (ret == NULL)
			RTE_LOG(ERR, EAL, "Error allocating from heap\n");
	}
alloc_unlock:
	rte_spinlock_unlock(&(heap->lock));
	return ret;
}
void *
malloc_heap_alloc(const char *type, size_t size, int socket_arg,
		unsigned int flags, size_t align, size_t bound, bool contig)
{
	int socket, heap_id, i;
	void *ret;

	/* return NULL if size is 0 or alignment is not power-of-2 */
	if (size == 0 || (align && !rte_is_power_of_2(align)))
		return NULL;

	if (!rte_eal_has_hugepages() && socket_arg < RTE_MAX_NUMA_NODES)
		socket_arg = SOCKET_ID_ANY;

	if (socket_arg == SOCKET_ID_ANY)
		socket = malloc_get_numa_socket();
	else
		socket = socket_arg;

	/* turn socket ID into heap ID */
	heap_id = malloc_socket_to_heap_id(socket);
	/* if heap id is negative, socket ID was invalid */
	if (heap_id < 0)
		return NULL;

	ret = malloc_heap_alloc_on_heap_id(type, size, heap_id, flags, align,
			bound, contig);
	if (ret != NULL || socket_arg != SOCKET_ID_ANY)
		return ret;

	/* try other heaps. we are only iterating through native DPDK sockets,
	 * so external heaps won't be included.
	 */
	for (i = 0; i < (int) rte_socket_count(); i++) {
		if (i == heap_id)
			continue;
		ret = malloc_heap_alloc_on_heap_id(type, size, i, flags, align,
				bound, contig);
		if (ret != NULL)
			return ret;
	}
	return NULL;
}
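/*
 * Usage sketch (illustrative; roughly what the rte_malloc() family does
 * internally when calling into this function):
 *
 *	void *obj = malloc_heap_alloc("example", 4096, SOCKET_ID_ANY, 0,
 *			RTE_CACHE_LINE_SIZE, 0, false);
 *
 * i.e. a cache-line-aligned 4K allocation from any NUMA socket, with no
 * page-size constraint, no boundary requirement, and no need for
 * IOVA-contiguous backing.
 */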
static void *
heap_alloc_biggest_on_heap_id(const char *type, unsigned int heap_id,
		unsigned int flags, size_t align, bool contig)
{
	struct rte_mem_config *mcfg = rte_eal_get_configuration()->mem_config;
	struct malloc_heap *heap = &mcfg->malloc_heaps[heap_id];
	void *ret;

	rte_spinlock_lock(&(heap->lock));

	align = align == 0 ? 1 : align;

	ret = heap_alloc_biggest(heap, type, flags, align, contig);

	rte_spinlock_unlock(&(heap->lock));

	return ret;
}

void *
malloc_heap_alloc_biggest(const char *type, int socket_arg, unsigned int flags,
		size_t align, bool contig)
{
	int socket, i, cur_socket, heap_id;
	void *ret;

	/* return NULL if align is not power-of-2 */
	if ((align && !rte_is_power_of_2(align)))
		return NULL;

	if (!rte_eal_has_hugepages())
		socket_arg = SOCKET_ID_ANY;

	if (socket_arg == SOCKET_ID_ANY)
		socket = malloc_get_numa_socket();
	else
		socket = socket_arg;

	/* turn socket ID into heap ID */
	heap_id = malloc_socket_to_heap_id(socket);
	/* if heap id is negative, socket ID was invalid */
	if (heap_id < 0)
		return NULL;

	ret = heap_alloc_biggest_on_heap_id(type, heap_id, flags, align,
			contig);
	if (ret != NULL || socket_arg != SOCKET_ID_ANY)
		return ret;

	/* try other heaps */
	for (i = 0; i < (int) rte_socket_count(); i++) {
		cur_socket = rte_socket_id_by_idx(i);
		if (cur_socket == socket)
			continue;
		ret = heap_alloc_biggest_on_heap_id(type, i, flags, align,
				contig);
		if (ret != NULL)
			return ret;
	}
	return NULL;
}

/* this function is exposed in malloc_mp.h */
int
malloc_heap_free_pages(void *aligned_start, size_t aligned_len)
{
	int n_segs, seg_idx, max_seg_idx;
	struct rte_memseg_list *msl;
	size_t page_sz;

	msl = rte_mem_virt2memseg_list(aligned_start);
	if (msl == NULL)
		return -1;

	page_sz = (size_t)msl->page_sz;
	n_segs = aligned_len / page_sz;
	seg_idx = RTE_PTR_DIFF(aligned_start, msl->base_va) / page_sz;
	max_seg_idx = seg_idx + n_segs;

	for (; seg_idx < max_seg_idx; seg_idx++) {
		struct rte_memseg *ms;

		ms = rte_fbarray_get(&msl->memseg_arr, seg_idx);
		eal_memalloc_free_seg(ms);
	}
	return 0;
}
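/*
 * Worked example for the index math above (illustrative): with 2M pages,
 * if aligned_start is 6 MB past msl->base_va and aligned_len == 4 MB,
 * then seg_idx == 3 and n_segs == 2, so segments 3 and 4 of the fbarray
 * are released back to the system.
 */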
int
malloc_heap_free(struct malloc_elem *elem)
{
	struct malloc_heap *heap;
	void *start, *aligned_start, *end, *aligned_end;
	size_t len, aligned_len, page_sz;
	struct rte_memseg_list *msl;
	unsigned int i, n_segs, before_space, after_space;
	int ret;
	const struct internal_config *internal_conf =
		eal_get_internal_configuration();

	if (!malloc_elem_cookies_ok(elem) || elem->state != ELEM_BUSY)
		return -1;

	/* elem may be merged with previous element, so keep heap address */
	heap = elem->heap;
	msl = elem->msl;
	page_sz = (size_t)msl->page_sz;

	rte_spinlock_lock(&(heap->lock));

	/* mark element as free */
	elem->state = ELEM_FREE;

	elem = malloc_elem_free(elem);

	/* anything after this is a bonus */
	ret = 0;

	/* ...of which we can't avail if we are in legacy mode, or if this is an
	 * externally allocated segment.
	 */
	if (internal_conf->legacy_mem || (msl->external > 0))
		goto free_unlock;

	/* check if we can free any memory back to the system */
	if (elem->size < page_sz)
		goto free_unlock;

	/* if user requested to match allocations, the sizes must match - if not,
	 * we will defer freeing these hugepages until the entire original allocation
	 * can be freed
	 */
	if (internal_conf->match_allocations && elem->size != elem->orig_size)
		goto free_unlock;

	/* probably, but let's make sure, as we may not be using up a full page */
	start = elem;
	len = elem->size;
	aligned_start = RTE_PTR_ALIGN_CEIL(start, page_sz);
	end = RTE_PTR_ADD(elem, len);
	aligned_end = RTE_PTR_ALIGN_FLOOR(end, page_sz);

	aligned_len = RTE_PTR_DIFF(aligned_end, aligned_start);

	/* can't free anything */
	if (aligned_len < page_sz)
		goto free_unlock;

	/* we can free something. however, some of these pages may be marked as
	 * unfreeable, so check for that as well
	 */
	n_segs = aligned_len / page_sz;
	for (i = 0; i < n_segs; i++) {
		const struct rte_memseg *tmp =
				rte_mem_virt2memseg(aligned_start, msl);

		if (tmp->flags & RTE_MEMSEG_FLAG_DO_NOT_FREE) {
			/* this is an unfreeable segment, so move start */
			aligned_start = RTE_PTR_ADD(tmp->addr, tmp->len);
		}
	}

	/* recalculate length and number of segments */
	aligned_len = RTE_PTR_DIFF(aligned_end, aligned_start);
	n_segs = aligned_len / page_sz;

	/* check if we can still free some pages */
	if (n_segs == 0)
		goto free_unlock;
	/* We're not done yet. We also have to check if by freeing space we will
	 * be leaving free elements that are too small to store new elements.
	 * Check if we have enough space in the beginning and at the end, or if
	 * start/end are exactly page aligned.
	 */
	before_space = RTE_PTR_DIFF(aligned_start, elem);
	after_space = RTE_PTR_DIFF(end, aligned_end);
	if (before_space != 0 &&
			before_space < MALLOC_ELEM_OVERHEAD + MIN_DATA_SIZE) {
		/* There is not enough space before start, but we may be able to
		 * move the start forward by one page.
		 */
		if (n_segs == 1)
			goto free_unlock;

		/* move start */
		aligned_start = RTE_PTR_ADD(aligned_start, page_sz);
		aligned_len -= page_sz;
		n_segs--;
	}
	if (after_space != 0 && after_space <
			MALLOC_ELEM_OVERHEAD + MIN_DATA_SIZE) {
		/* There is not enough space after end, but we may be able to
		 * move the end backwards by one page.
		 */
		if (n_segs == 1)
			goto free_unlock;

		/* move end */
		aligned_end = RTE_PTR_SUB(aligned_end, page_sz);
		aligned_len -= page_sz;
		n_segs--;
	}

	/* now we can finally free us some pages */

	rte_mcfg_mem_write_lock();
	/*
	 * we allow secondary processes to clear the heap of this allocated
	 * memory because it is safe to do so, as even if notifications about
	 * unmapped pages don't make it to other processes, heap is shared
	 * across all processes, and will become empty of this memory anyway,
	 * and nothing can allocate it back unless primary process will be able
	 * to deliver allocation message to every single running process.
	 */

	malloc_elem_free_list_remove(elem);

	malloc_elem_hide_region(elem, (void *) aligned_start, aligned_len);

	heap->total_size -= aligned_len;

	if (rte_eal_process_type() == RTE_PROC_PRIMARY) {
		/* notify user about changes in memory map */
		eal_memalloc_mem_event_notify(RTE_MEM_EVENT_FREE,
				aligned_start, aligned_len);

		/* don't care if any of this fails */
		malloc_heap_free_pages(aligned_start, aligned_len);

		request_sync();
	} else {
		struct malloc_mp_req req;

		memset(&req, 0, sizeof(req));

		req.t = REQ_TYPE_FREE;
		req.free_req.addr = aligned_start;
		req.free_req.len = aligned_len;

		/*
		 * we request primary to deallocate pages, but we don't do it
		 * in this thread. instead, we notify primary that we would like
		 * to deallocate pages, and this process will receive another
		 * request (in parallel) that will do it for us on another
		 * thread.
		 *
		 * we also don't really care if this succeeds - the data is
		 * already removed from the heap, so it is, for all intents and
		 * purposes, hidden from the rest of DPDK even if some other
		 * process (including this one) may have these pages mapped.
		 *
		 * notifications about deallocated memory happen during sync.
		 */
		request_to_primary(&req);
	}
	RTE_LOG(DEBUG, EAL, "Heap on socket %d was shrunk by %zdMB\n",
		msl->socket_id, aligned_len >> 20ULL);

	rte_mcfg_mem_write_unlock();
free_unlock:
	rte_spinlock_unlock(&(heap->lock));
	return ret;
}

int
malloc_heap_resize(struct malloc_elem *elem, size_t size)
{
	int ret;

	if (!malloc_elem_cookies_ok(elem) || elem->state != ELEM_BUSY)
		return -1;

	rte_spinlock_lock(&(elem->heap->lock));

	ret = malloc_elem_resize(elem, size);

	rte_spinlock_unlock(&(elem->heap->lock));

	return ret;
}
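/*
 * Note on the stats reported below (illustrative): heap_allocsz_bytes is
 * derived as heap_totalsz_bytes - heap_freesz_bytes, so e.g. a 1 GB heap
 * whose free lists hold 256 MB reports 768 MB as allocated.
 */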
/*
 * Function to dump contents of a given heap
 */
void
malloc_heap_dump(struct malloc_heap *heap, FILE *f)
{
	struct malloc_elem *elem;

	rte_spinlock_lock(&heap->lock);

	fprintf(f, "Heap size: 0x%zx\n", heap->total_size);
	fprintf(f, "Heap alloc count: %u\n", heap->alloc_count);

	elem = heap->first;
	while (elem) {
		malloc_elem_dump(elem, f);
		elem = elem->next;
	}

	rte_spinlock_unlock(&heap->lock);
}

static int
destroy_elem(struct malloc_elem *elem, size_t len)
{
	struct malloc_heap *heap = elem->heap;

	/* notify all subscribers that a memory area is going to be removed */
	eal_memalloc_mem_event_notify(RTE_MEM_EVENT_FREE, elem, len);

	/* this element can be removed */
	malloc_elem_free_list_remove(elem);
	malloc_elem_hide_region(elem, elem, len);

	heap->total_size -= len;

	memset(elem, 0, sizeof(*elem));

	return 0;
}
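/*
 * Create a memseg list describing an externally allocated memory area: find
 * an unused slot in the memseg list array, back it with an fbarray, and fill
 * in one rte_memseg per page. Returns the new list on success, or NULL with
 * rte_errno set on failure.
 */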
struct rte_memseg_list *
malloc_heap_create_external_seg(void *va_addr, rte_iova_t iova_addrs[],
		unsigned int n_pages, size_t page_sz, const char *seg_name,
		unsigned int socket_id)
{
	struct rte_mem_config *mcfg = rte_eal_get_configuration()->mem_config;
	char fbarray_name[RTE_FBARRAY_NAME_LEN];
	struct rte_memseg_list *msl = NULL;
	struct rte_fbarray *arr;
	size_t seg_len = n_pages * page_sz;
	unsigned int i;

	/* first, find a free memseg list */
	for (i = 0; i < RTE_MAX_MEMSEG_LISTS; i++) {
		struct rte_memseg_list *tmp = &mcfg->memsegs[i];
		if (tmp->base_va == NULL) {
			msl = tmp;
			break;
		}
	}
	if (msl == NULL) {
		RTE_LOG(ERR, EAL, "Couldn't find empty memseg list\n");
		rte_errno = ENOSPC;
		return NULL;
	}

	snprintf(fbarray_name, sizeof(fbarray_name), "%s_%p",
			seg_name, va_addr);

	/* create the backing fbarray */
	if (rte_fbarray_init(&msl->memseg_arr, fbarray_name, n_pages,
			sizeof(struct rte_memseg)) < 0) {
		RTE_LOG(ERR, EAL, "Couldn't create fbarray backing the memseg list\n");
		return NULL;
	}
	arr = &msl->memseg_arr;

	/* fbarray created, fill it up */
	for (i = 0; i < n_pages; i++) {
		struct rte_memseg *ms;

		rte_fbarray_set_used(arr, i);
		ms = rte_fbarray_get(arr, i);
		ms->addr = RTE_PTR_ADD(va_addr, i * page_sz);
		ms->iova = iova_addrs == NULL ? RTE_BAD_IOVA : iova_addrs[i];
		ms->hugepage_sz = page_sz;
		ms->len = page_sz;
		ms->nchannel = rte_memory_get_nchannel();
		ms->nrank = rte_memory_get_nrank();
		ms->socket_id = socket_id;
	}

	/* set up the memseg list */
	msl->base_va = va_addr;
	msl->page_sz = page_sz;
	msl->socket_id = socket_id;
	msl->len = seg_len;
	msl->version = 0;
	msl->external = 1;

	return msl;
}

struct extseg_walk_arg {
	void *va_addr;
	size_t len;
	struct rte_memseg_list *msl;
};

static int
extseg_walk(const struct rte_memseg_list *msl, void *arg)
{
	struct rte_mem_config *mcfg = rte_eal_get_configuration()->mem_config;
	struct extseg_walk_arg *wa = arg;

	if (msl->base_va == wa->va_addr && msl->len == wa->len) {
		unsigned int found_idx;

		/* msl is const, so find it by index to get a writable pointer */
		found_idx = msl - mcfg->memsegs;
		wa->msl = &mcfg->memsegs[found_idx];
		return 1;
	}
	return 0;
}
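/*
 * Look up the memseg list describing a previously attached external memory
 * area by its VA address and length, using the walk callback above. Returns
 * NULL with rte_errno set to ENOENT if no matching list is found.
 */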
struct rte_memseg_list *
malloc_heap_find_external_seg(void *va_addr, size_t len)
{
	struct extseg_walk_arg wa;
	int res;

	wa.va_addr = va_addr;
	wa.len = len;

	res = rte_memseg_list_walk_thread_unsafe(extseg_walk, &wa);

	if (res != 1) {
		/* 0 means nothing was found, -1 shouldn't happen */
		if (res == 0)
			rte_errno = ENOENT;
		return NULL;
	}
	return wa.msl;
}

int
malloc_heap_destroy_external_seg(struct rte_memseg_list *msl)
{
	/* destroy the fbarray backing this memory */
	if (rte_fbarray_destroy(&msl->memseg_arr) < 0)
		return -1;

	/* reset the memseg list */
	memset(msl, 0, sizeof(*msl));

	return 0;
}
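/*
 * Attach an externally allocated segment to a heap: zero out the memory,
 * add it to the heap as a single free element, and notify all subscribers
 * of the newly added memory area.
 */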
int
malloc_heap_add_external_memory(struct malloc_heap *heap,
		struct rte_memseg_list *msl)
{
	/* erase contents of new memory */
	memset(msl->base_va, 0, msl->len);

	/* now, add newly minted memory to the malloc heap */
	malloc_heap_add_memory(heap, msl, msl->base_va, msl->len);

	heap->total_size += msl->len;

	/* all done! */
	RTE_LOG(DEBUG, EAL, "Added segment for heap %s starting at %p\n",
			heap->name, msl->base_va);

	/* notify all subscribers that a new memory area has been added */
	eal_memalloc_mem_event_notify(RTE_MEM_EVENT_ALLOC,
			msl->base_va, msl->len);

	return 0;
}

int
malloc_heap_remove_external_memory(struct malloc_heap *heap, void *va_addr,
		size_t len)
{
	struct malloc_elem *elem = heap->first;

	/* find element with specified va address */
	while (elem != NULL && elem != va_addr) {
		elem = elem->next;
		/* stop if we've blown past our VA */
		if (elem > (struct malloc_elem *)va_addr) {
			rte_errno = ENOENT;
			return -1;
		}
	}
	/* check if element was found */
	if (elem == NULL || elem->msl->len != len) {
		rte_errno = ENOENT;
		return -1;
	}
	/* if element's size is not equal to segment len, segment is busy */
	if (elem->state == ELEM_BUSY || elem->size != len) {
		rte_errno = EBUSY;
		return -1;
	}
	return destroy_elem(elem, len);
}
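/*
 * Initialize an empty named heap and assign it a synthetic socket ID (real
 * NUMA nodes keep their own IDs; external heaps get IDs starting at
 * EXTERNAL_HEAP_MIN_SOCKET_ID). The caller holds the global mem hotplug
 * write lock, which is what makes the socket ID increment below safe.
 */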
int
malloc_heap_create(struct malloc_heap *heap, const char *heap_name)
{
	struct rte_mem_config *mcfg = rte_eal_get_configuration()->mem_config;
	uint32_t next_socket_id = mcfg->next_socket_id;

	/* prevent overflow. did you really create 2 billion heaps??? */
	if (next_socket_id > INT32_MAX) {
		RTE_LOG(ERR, EAL, "Cannot assign new socket IDs\n");
		rte_errno = ENOSPC;
		return -1;
	}

	/* initialize empty heap */
	heap->alloc_count = 0;
	heap->first = NULL;
	heap->last = NULL;
	LIST_INIT(heap->free_head);
	rte_spinlock_init(&heap->lock);
	heap->total_size = 0;
	heap->socket_id = next_socket_id;

	/* we hold a global mem hotplug writelock, so it's safe to increment */
	mcfg->next_socket_id++;

	/* set up name */
	strlcpy(heap->name, heap_name, RTE_HEAP_NAME_MAX_LEN);
	return 0;
}

int
malloc_heap_destroy(struct malloc_heap *heap)
{
	if (heap->alloc_count != 0) {
		RTE_LOG(ERR, EAL, "Heap is still in use\n");
		rte_errno = EBUSY;
		return -1;
	}
	if (heap->first != NULL || heap->last != NULL) {
		RTE_LOG(ERR, EAL, "Heap still contains memory segments\n");
		rte_errno = EBUSY;
		return -1;
	}
	if (heap->total_size != 0)
		RTE_LOG(ERR, EAL, "Total size not zero, heap is likely corrupt\n");

	/* after this, the lock will be dropped */
	memset(heap, 0, sizeof(*heap));

	return 0;
}
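/*
 * EAL entry point: set up the default per-socket heaps, register the malloc
 * multiprocess request handlers, and (in the primary process) populate the
 * heaps with all IOVA-contiguous memory discovered at init time.
 */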
int
rte_eal_malloc_heap_init(void)
{
	struct rte_mem_config *mcfg = rte_eal_get_configuration()->mem_config;
	unsigned int i;
	const struct internal_config *internal_conf =
		eal_get_internal_configuration();

	if (internal_conf->match_allocations)
		RTE_LOG(DEBUG, EAL, "Hugepages will be freed exactly as allocated.\n");

	if (rte_eal_process_type() == RTE_PROC_PRIMARY) {
		/* assign min socket ID to external heaps */
		mcfg->next_socket_id = EXTERNAL_HEAP_MIN_SOCKET_ID;

		/* assign names to default DPDK heaps */
		for (i = 0; i < rte_socket_count(); i++) {
			struct malloc_heap *heap = &mcfg->malloc_heaps[i];
			char heap_name[RTE_HEAP_NAME_MAX_LEN];
			int socket_id = rte_socket_id_by_idx(i);

			snprintf(heap_name, sizeof(heap_name),
					"socket_%i", socket_id);
			strlcpy(heap->name, heap_name, RTE_HEAP_NAME_MAX_LEN);
			heap->socket_id = socket_id;
		}
	}

	if (register_mp_requests()) {
		RTE_LOG(ERR, EAL, "Couldn't register malloc multiprocess actions\n");
		rte_mcfg_mem_read_unlock();
		return -1;
	}

	/* unlock mem hotplug here. it's safe for primary as no requests can
	 * even come in before primary itself is fully initialized, and
	 * secondaries do not need to initialize the heap.
	 */
	rte_mcfg_mem_read_unlock();

	/* secondary process does not need to initialize anything */
	if (rte_eal_process_type() != RTE_PROC_PRIMARY)
		return 0;

	/* add all IOVA-contiguous areas to the heap */
	return rte_memseg_contig_walk(malloc_add_seg, NULL);
}
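/*
 * Illustrative sketch (not part of this file): the external-heap routines
 * above are driven through the public rte_malloc API. A typical lifecycle,
 * with hypothetical addr/len/page_sz values supplied by the application:
 *
 *	int socket;
 *	void *obj;
 *
 *	rte_malloc_heap_create("my_heap");
 *	rte_malloc_heap_memory_add("my_heap", addr, len, NULL,
 *			len / page_sz, page_sz);
 *	socket = rte_malloc_heap_get_socket("my_heap");
 *	obj = rte_malloc_socket(NULL, 4096, 0, socket);
 *	...
 *	rte_free(obj);
 *	rte_malloc_heap_memory_remove("my_heap", addr, len);
 *	rte_malloc_heap_destroy("my_heap");
 */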