/* SPDX-License-Identifier: BSD-3-Clause
 * Copyright(c) 2010-2014 Intel Corporation
 */
#include <stdint.h>
#include <stddef.h>
#include <stdlib.h>
#include <stdio.h>
#include <errno.h>
#include <sys/queue.h>

#include <rte_memory.h>
#include <rte_errno.h>
#include <rte_eal.h>
#include <rte_eal_memconfig.h>
#include <rte_lcore.h>
#include <rte_common.h>
#include <rte_string_fns.h>
#include <rte_spinlock.h>
#include <rte_memzone.h>
#include <rte_fbarray.h>

#include "eal_internal_cfg.h"
#include "eal_memalloc.h"
#include "eal_memcfg.h"
#include "eal_private.h"
#include "malloc_elem.h"
#include "malloc_heap.h"
#include "malloc_mp.h"

/* start external socket IDs at a very high number */
#define CONST_MAX(a, b) (a > b ? a : b) /* RTE_MAX is not a constant */
#define EXTERNAL_HEAP_MIN_SOCKET_ID (CONST_MAX((1 << 8), RTE_MAX_NUMA_NODES))

static unsigned
check_hugepage_sz(unsigned flags, uint64_t hugepage_sz)
{
	unsigned check_flag = 0;

	if (!(flags & ~RTE_MEMZONE_SIZE_HINT_ONLY))
		return 1;

	switch (hugepage_sz) {
	case RTE_PGSIZE_256K:
		check_flag = RTE_MEMZONE_256KB;
		break;
	case RTE_PGSIZE_2M:
		check_flag = RTE_MEMZONE_2MB;
		break;
	case RTE_PGSIZE_16M:
		check_flag = RTE_MEMZONE_16MB;
		break;
	case RTE_PGSIZE_256M:
		check_flag = RTE_MEMZONE_256MB;
		break;
	case RTE_PGSIZE_512M:
		check_flag = RTE_MEMZONE_512MB;
		break;
	case RTE_PGSIZE_1G:
		check_flag = RTE_MEMZONE_1GB;
		break;
	case RTE_PGSIZE_4G:
		check_flag = RTE_MEMZONE_4GB;
		break;
	case RTE_PGSIZE_16G:
		check_flag = RTE_MEMZONE_16GB;
	}

	return check_flag & flags;
}
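
/*
 * Illustration of the helper above (not part of the build): page-size flags
 * must match the element's page size exactly, unless only the size hint is
 * set, in which case any page size passes:
 *
 *	check_hugepage_sz(RTE_MEMZONE_2MB, RTE_PGSIZE_1G);             -> 0
 *	check_hugepage_sz(RTE_MEMZONE_SIZE_HINT_ONLY, RTE_PGSIZE_1G);  -> 1
 */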

int
malloc_socket_to_heap_id(unsigned int socket_id)
{
	struct rte_mem_config *mcfg = rte_eal_get_configuration()->mem_config;
	int i;

	for (i = 0; i < RTE_MAX_HEAPS; i++) {
		struct malloc_heap *heap = &mcfg->malloc_heaps[i];

		if (heap->socket_id == socket_id)
			return i;
	}
	return -1;
}

/*
 * Expand the heap with a memory area.
 */
static struct malloc_elem *
malloc_heap_add_memory(struct malloc_heap *heap, struct rte_memseg_list *msl,
		void *start, size_t len, bool dirty)
{
	struct malloc_elem *elem = start;

	malloc_elem_init(elem, heap, msl, len, elem, len, dirty);

	malloc_elem_insert(elem);

	elem = malloc_elem_join_adjacent_free(elem);

	malloc_elem_free_list_insert(elem);

	return elem;
}

static int
malloc_add_seg(const struct rte_memseg_list *msl,
		const struct rte_memseg *ms, size_t len, void *arg __rte_unused)
{
	struct rte_mem_config *mcfg = rte_eal_get_configuration()->mem_config;
	struct rte_memseg_list *found_msl;
	struct malloc_heap *heap;
	int msl_idx, heap_idx;

	if (msl->external)
		return 0;

	heap_idx = malloc_socket_to_heap_id(msl->socket_id);
	if (heap_idx < 0) {
		RTE_LOG(ERR, EAL, "Memseg list has invalid socket id\n");
		return -1;
	}
	heap = &mcfg->malloc_heaps[heap_idx];

	/* msl is const, so find it */
	msl_idx = msl - mcfg->memsegs;

	if (msl_idx < 0 || msl_idx >= RTE_MAX_MEMSEG_LISTS)
		return -1;

	found_msl = &mcfg->memsegs[msl_idx];

	malloc_heap_add_memory(heap, found_msl, ms->addr, len,
			ms->flags & RTE_MEMSEG_FLAG_DIRTY);

	heap->total_size += len;

	RTE_LOG(DEBUG, EAL, "Added %zuM to heap on socket %i\n", len >> 20,
		msl->socket_id);
	return 0;
}
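
/*
 * One plausible wiring for the callback above (an assumption for
 * illustration; the actual init code lives elsewhere in EAL): registering
 * every internal memseg with its heap via the contiguous memseg walk:
 *
 *	rte_memseg_contig_walk(malloc_add_seg, NULL);
 */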

/*
 * Iterates through the freelist for a heap to find a free element
 * which can store data of the required size and with the requested alignment.
 * If size is 0, find the biggest available elem.
 * Returns null on failure, or pointer to element on success.
 */
static struct malloc_elem *
find_suitable_element(struct malloc_heap *heap, size_t size,
		unsigned int flags, size_t align, size_t bound, bool contig)
{
	size_t idx;
	struct malloc_elem *elem, *alt_elem = NULL;

	for (idx = malloc_elem_free_list_index(size);
			idx < RTE_HEAP_NUM_FREELISTS; idx++) {
		for (elem = LIST_FIRST(&heap->free_head[idx]);
				!!elem; elem = LIST_NEXT(elem, free_list)) {
			if (malloc_elem_can_hold(elem, size, align, bound,
					contig)) {
				if (check_hugepage_sz(flags,
						elem->msl->page_sz))
					return elem;
				if (alt_elem == NULL)
					alt_elem = elem;
			}
		}
	}

	if ((alt_elem != NULL) && (flags & RTE_MEMZONE_SIZE_HINT_ONLY))
		return alt_elem;

	return NULL;
}
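
/*
 * Illustrative note on the fallback above: with RTE_MEMZONE_SIZE_HINT_ONLY
 * set, a fitting element on a non-matching page size (alt_elem) is still
 * returned; without the hint, the lookup fails instead. E.g.:
 *
 *	flags = RTE_MEMZONE_1GB | RTE_MEMZONE_SIZE_HINT_ONLY;
 *	elem = find_suitable_element(heap, size, flags, align, 0, false);
 *	-- elem may live on 2M pages if no fitting 1G element exists
 */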

/*
 * Iterates through the freelist for a heap to find a free element with the
 * biggest size and requested alignment. Will also set size to whatever element
 * size that was found.
 * Returns null on failure, or pointer to element on success.
 */
static struct malloc_elem *
find_biggest_element(struct malloc_heap *heap, size_t *size,
		unsigned int flags, size_t align, bool contig)
{
	struct malloc_elem *elem, *max_elem = NULL;
	size_t idx, max_size = 0;

	for (idx = 0; idx < RTE_HEAP_NUM_FREELISTS; idx++) {
		for (elem = LIST_FIRST(&heap->free_head[idx]);
				!!elem; elem = LIST_NEXT(elem, free_list)) {
			size_t cur_size;
			if ((flags & RTE_MEMZONE_SIZE_HINT_ONLY) == 0 &&
					!check_hugepage_sz(flags,
						elem->msl->page_sz))
				continue;
			if (contig) {
				cur_size =
					malloc_elem_find_max_iova_contig(elem,
							align);
			} else {
				void *data_start = RTE_PTR_ADD(elem,
						MALLOC_ELEM_HEADER_LEN);
				void *data_end = RTE_PTR_ADD(elem, elem->size -
						MALLOC_ELEM_TRAILER_LEN);
				void *aligned = RTE_PTR_ALIGN_CEIL(data_start,
						align);
				/* check if aligned data start is beyond end */
				if (aligned >= data_end)
					continue;
				cur_size = RTE_PTR_DIFF(data_end, aligned);
			}
			if (cur_size > max_size) {
				max_size = cur_size;
				max_elem = elem;
			}
		}
	}

	*size = max_size;
	return max_elem;
}
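
/*
 * Illustrative example for the helper above: for a free element with
 * elem->size = 1M and align = 4K, the usable size reported in the
 * non-contiguous case is 1M minus the element header and trailer, minus
 * whatever aligning the data start up to 4K consumes.
 */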

/*
 * Main function to allocate a block of memory from the heap.
 * It locks the free list, scans it, and adds a new memseg if the
 * scan fails. Once the new memseg is added, it re-scans and should return
 * the new element after releasing the lock.
 */
static void *
heap_alloc(struct malloc_heap *heap, const char *type __rte_unused, size_t size,
		unsigned int flags, size_t align, size_t bound, bool contig)
{
	struct malloc_elem *elem;
	size_t user_size = size;

	size = RTE_CACHE_LINE_ROUNDUP(size);
	align = RTE_CACHE_LINE_ROUNDUP(align);

	/* roundup might cause an overflow */
	if (size == 0)
		return NULL;
	elem = find_suitable_element(heap, size, flags, align, bound, contig);
	if (elem != NULL) {
		elem = malloc_elem_alloc(elem, size, align, bound, contig);

		/* increase heap's count of allocated elements */
		heap->alloc_count++;

		asan_set_redzone(elem, user_size);
	}

	return elem == NULL ? NULL : (void *)(&elem[1]);
}
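
/*
 * Illustrative note on the rounding in heap_alloc() above:
 * RTE_CACHE_LINE_ROUNDUP() can wrap around to 0 for sizes close to SIZE_MAX,
 * which is why size == 0 is re-checked after rounding rather than being
 * treated as a valid request.
 */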

static void *
heap_alloc_biggest(struct malloc_heap *heap, const char *type __rte_unused,
		unsigned int flags, size_t align, bool contig)
{
	struct malloc_elem *elem;
	size_t size;

	align = RTE_CACHE_LINE_ROUNDUP(align);

	elem = find_biggest_element(heap, &size, flags, align, contig);
	if (elem != NULL) {
		elem = malloc_elem_alloc(elem, size, align, 0, contig);

		/* increase heap's count of allocated elements */
		heap->alloc_count++;

		asan_set_redzone(elem, size);
	}

	return elem == NULL ? NULL : (void *)(&elem[1]);
}

/* this function is exposed in malloc_mp.h */
void
rollback_expand_heap(struct rte_memseg **ms, int n_segs,
		struct malloc_elem *elem, void *map_addr, size_t map_len)
{
	if (elem != NULL) {
		malloc_elem_free_list_remove(elem);
		malloc_elem_hide_region(elem, map_addr, map_len);
	}

	eal_memalloc_free_seg_bulk(ms, n_segs);
}

/* this function is exposed in malloc_mp.h */
struct malloc_elem *
alloc_pages_on_heap(struct malloc_heap *heap, uint64_t pg_sz, size_t elt_size,
		int socket, unsigned int flags, size_t align, size_t bound,
		bool contig, struct rte_memseg **ms, int n_segs)
{
	struct rte_mem_config *mcfg = rte_eal_get_configuration()->mem_config;
	struct rte_memseg_list *msl;
	struct malloc_elem *elem = NULL;
	size_t alloc_sz;
	int allocd_pages, i;
	bool dirty = false;
	void *ret, *map_addr;

	alloc_sz = (size_t)pg_sz * n_segs;

	/* first, check if we're allowed to allocate this memory */
	if (eal_memalloc_mem_alloc_validate(socket,
			heap->total_size + alloc_sz) < 0) {
		RTE_LOG(DEBUG, EAL, "User has disallowed allocation\n");
		return NULL;
	}

	allocd_pages = eal_memalloc_alloc_seg_bulk(ms, n_segs, pg_sz,
			socket, true);

	/* make sure we've allocated our pages... */
	if (allocd_pages < 0)
		return NULL;

	map_addr = ms[0]->addr;
	msl = rte_mem_virt2memseg_list(map_addr);

	/* check if we wanted contiguous memory but didn't get it */
	if (contig && !eal_memalloc_is_contig(msl, map_addr, alloc_sz)) {
		RTE_LOG(DEBUG, EAL, "%s(): couldn't allocate physically contiguous space\n",
			__func__);
		goto fail;
	}
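
	/*
	 * Illustrative note: the contiguity re-check above is needed because
	 * virtually adjacent pages are not necessarily IOVA-contiguous; two
	 * neighbouring VA pages may map to unrelated physical addresses.
	 */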

	/*
	 * Once we have all the memseg lists configured, if there is a dma mask
	 * set, check that the iova addresses are not out of range. Otherwise
	 * the device setting the dma mask could have problems with the mapped
	 * memory.
	 *
	 * There are two situations when this can happen:
	 *	1) memory initialization
	 *	2) dynamic memory allocation
	 *
	 * For 1), an error when checking dma mask implies the app cannot be
	 * executed. For 2), it implies the new memory cannot be added.
	 */
	if (mcfg->dma_maskbits &&
	    rte_mem_check_dma_mask_thread_unsafe(mcfg->dma_maskbits)) {
		/*
		 * Currently this can only happen if IOMMU is enabled
		 * and the address width supported by the IOMMU hw is
		 * not enough for using the memory mapped IOVAs.
		 *
		 * If IOVA is VA, advise trying '--iova-mode pa', which
		 * could solve some situations when IOVA VA is not really
		 * needed.
		 */
		RTE_LOG(ERR, EAL,
			"%s(): couldn't allocate memory due to IOVA exceeding limits of current DMA mask\n",
			__func__);

		/*
		 * If IOVA is VA and it is possible to run with IOVA PA,
		 * because the user is root, give advice on solving the
		 * problem.
		 */
		if ((rte_eal_iova_mode() == RTE_IOVA_VA) &&
				rte_eal_using_phys_addrs())
			RTE_LOG(ERR, EAL,
				"%s(): Please try initializing EAL with --iova-mode=pa parameter\n",
				__func__);
		goto fail;
	}
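
	/*
	 * Hypothetical example for the DMA mask check above: with IOVA as VA
	 * and an IOMMU limited to, say, 39 bits, a page mapped at a virtual
	 * address above (1ULL << 39) would fail the check and the allocation
	 * would be rolled back.
	 */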

	/* Element is dirty if it contains at least one dirty page. */
	for (i = 0; i < allocd_pages; i++)
		dirty |= ms[i]->flags & RTE_MEMSEG_FLAG_DIRTY;

	/* add newly minted memsegs to malloc heap */
	elem = malloc_heap_add_memory(heap, msl, map_addr, alloc_sz, dirty);

	/* try once more, as now we have allocated new memory */
	ret = find_suitable_element(heap, elt_size, flags, align, bound,
			contig);

	if (ret == NULL)
		goto fail;

	return elem;

fail:
	rollback_expand_heap(ms, n_segs, elem, map_addr, alloc_sz);
	return NULL;
}
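
/*
 * Worked example for the sizing in try_expand_heap_primary() below
 * (illustrative numbers): elt_size = 5M with align = 64 on 2M pages gives
 * alloc_sz = RTE_ALIGN_CEIL(64 + 5M + MALLOC_ELEM_OVERHEAD, 2M) = 6M,
 * i.e. n_segs = 3, since the element overhead is far smaller than a page.
 */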

static int
try_expand_heap_primary(struct malloc_heap *heap, uint64_t pg_sz,
		size_t elt_size, int socket, unsigned int flags, size_t align,
		size_t bound, bool contig)
{
	struct malloc_elem *elem;
	struct rte_memseg **ms;
	void *map_addr;
	size_t alloc_sz;
	int n_segs;
	bool callback_triggered = false;

	alloc_sz = RTE_ALIGN_CEIL(align + elt_size +
			MALLOC_ELEM_OVERHEAD, pg_sz);
	n_segs = alloc_sz / pg_sz;

	/* we can't know in advance how many pages we'll need, so we malloc */
	ms = malloc(sizeof(*ms) * n_segs);
	if (ms == NULL)
		return -1;
	memset(ms, 0, sizeof(*ms) * n_segs);

	elem = alloc_pages_on_heap(heap, pg_sz, elt_size, socket, flags, align,
			bound, contig, ms, n_segs);

	if (elem == NULL)
		goto free_ms;

	map_addr = ms[0]->addr;

	/* notify user about changes in memory map */
	eal_memalloc_mem_event_notify(RTE_MEM_EVENT_ALLOC, map_addr, alloc_sz);

	/* notify other processes that this has happened */
	if (request_sync()) {
		/* we couldn't ensure all processes have mapped memory,
		 * so free it back and notify everyone that it's been
		 * freed back.
		 *
		 * technically, we could've avoided adding memory addresses to
		 * the map, but that would've led to inconsistent behavior
		 * between primary and secondary processes, as those get
		 * callbacks during sync. therefore, force primary process to
		 * do alloc-and-rollback syncs as well.
		 */
		callback_triggered = true;
		goto free_elem;
	}
	heap->total_size += alloc_sz;

	RTE_LOG(DEBUG, EAL, "Heap on socket %d was expanded by %zdMB\n",
		socket, alloc_sz >> 20ULL);

	free(ms);

	return 0;

free_elem:
	if (callback_triggered)
		eal_memalloc_mem_event_notify(RTE_MEM_EVENT_FREE,
				map_addr, alloc_sz);

	rollback_expand_heap(ms, n_segs, elem, map_addr, alloc_sz);

	request_sync();
free_ms:
	free(ms);

	return -1;
}

static int
try_expand_heap_secondary(struct malloc_heap *heap, uint64_t pg_sz,
		size_t elt_size, int socket, unsigned int flags, size_t align,
		size_t bound, bool contig)
{
	struct rte_mem_config *mcfg = rte_eal_get_configuration()->mem_config;
	struct malloc_mp_req req;
	int req_result;

	memset(&req, 0, sizeof(req));

	req.t = REQ_TYPE_ALLOC;
	req.alloc_req.align = align;
	req.alloc_req.bound = bound;
	req.alloc_req.contig = contig;
	req.alloc_req.flags = flags;
	req.alloc_req.elt_size = elt_size;
	req.alloc_req.page_sz = pg_sz;
	req.alloc_req.socket = socket;
	req.alloc_req.malloc_heap_idx = heap - mcfg->malloc_heaps;

	req_result = request_to_primary(&req);

	if (req_result != 0)
		return -1;

	if (req.result != REQ_RESULT_SUCCESS)
		return -1;

	return 0;
}
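
/*
 * Note on the dispatch below: try_expand_heap() takes the memory hotplug
 * write lock and then branches on process type - the primary maps new pages
 * itself (try_expand_heap_primary), while a secondary asks the primary to do
 * it on its behalf over the multi-process channel (try_expand_heap_secondary).
 */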

static int
try_expand_heap(struct malloc_heap *heap, uint64_t pg_sz, size_t elt_size,
		int socket, unsigned int flags, size_t align, size_t bound,
		bool contig)
{
	int ret;

	rte_mcfg_mem_write_lock();

	if (rte_eal_process_type() == RTE_PROC_PRIMARY) {
		ret = try_expand_heap_primary(heap, pg_sz, elt_size, socket,
				flags, align, bound, contig);
	} else {
		ret = try_expand_heap_secondary(heap, pg_sz, elt_size, socket,
				flags, align, bound, contig);
	}

	rte_mcfg_mem_write_unlock();
	return ret;
}

static int
compare_pagesz(const void *a, const void *b)
{
	const struct rte_memseg_list * const*mpa = a;
	const struct rte_memseg_list * const*mpb = b;
	const struct rte_memseg_list *msla = *mpa;
	const struct rte_memseg_list *mslb = *mpb;
	uint64_t pg_sz_a = msla->page_sz;
	uint64_t pg_sz_b = mslb->page_sz;

	if (pg_sz_a < pg_sz_b)
		return -1;
	if (pg_sz_a > pg_sz_b)
		return 1;
	return 0;
}
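
/*
 * The comparator above is used with qsort() below so that memseg lists are
 * tried smallest page size first; e.g. lists with page sizes {1G, 2M, 2M}
 * sort to {2M, 2M, 1G}.
 */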

static int
alloc_more_mem_on_socket(struct malloc_heap *heap, size_t size, int socket,
		unsigned int flags, size_t align, size_t bound, bool contig)
{
	struct rte_mem_config *mcfg = rte_eal_get_configuration()->mem_config;
	struct rte_memseg_list *requested_msls[RTE_MAX_MEMSEG_LISTS];
	struct rte_memseg_list *other_msls[RTE_MAX_MEMSEG_LISTS];
	uint64_t requested_pg_sz[RTE_MAX_MEMSEG_LISTS];
	uint64_t other_pg_sz[RTE_MAX_MEMSEG_LISTS];
	uint64_t prev_pg_sz;
	int i, n_other_msls, n_other_pg_sz, n_requested_msls, n_requested_pg_sz;
	bool size_hint = (flags & RTE_MEMZONE_SIZE_HINT_ONLY) > 0;
	unsigned int size_flags = flags & ~RTE_MEMZONE_SIZE_HINT_ONLY;
	void *ret;

	memset(requested_msls, 0, sizeof(requested_msls));
	memset(other_msls, 0, sizeof(other_msls));
	memset(requested_pg_sz, 0, sizeof(requested_pg_sz));
	memset(other_pg_sz, 0, sizeof(other_pg_sz));

	/*
	 * go through memseg list and take note of all the page sizes available,
	 * and if any of them were specifically requested by the user.
	 */
	n_requested_msls = 0;
	n_other_msls = 0;
	for (i = 0; i < RTE_MAX_MEMSEG_LISTS; i++) {
		struct rte_memseg_list *msl = &mcfg->memsegs[i];

		if (msl->socket_id != socket)
			continue;

		if (msl->base_va == NULL)
			continue;

		/* if pages of specific size were requested */
		if (size_flags != 0 && check_hugepage_sz(size_flags,
				msl->page_sz))
			requested_msls[n_requested_msls++] = msl;
		else if (size_flags == 0 || size_hint)
			other_msls[n_other_msls++] = msl;
	}

	/* sort the lists, smallest first */
	qsort(requested_msls, n_requested_msls, sizeof(requested_msls[0]),
			compare_pagesz);
	qsort(other_msls, n_other_msls, sizeof(other_msls[0]),
			compare_pagesz);

	/* now, extract page sizes we are supposed to try */
	prev_pg_sz = 0;
	n_requested_pg_sz = 0;
	for (i = 0; i < n_requested_msls; i++) {
		uint64_t pg_sz = requested_msls[i]->page_sz;

		if (prev_pg_sz != pg_sz) {
			requested_pg_sz[n_requested_pg_sz++] = pg_sz;
			prev_pg_sz = pg_sz;
		}
	}
	prev_pg_sz = 0;
	n_other_pg_sz = 0;
	for (i = 0; i < n_other_msls; i++) {
		uint64_t pg_sz = other_msls[i]->page_sz;

		if (prev_pg_sz != pg_sz) {
			other_pg_sz[n_other_pg_sz++] = pg_sz;
			prev_pg_sz = pg_sz;
		}
	}
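
	/*
	 * Illustrative example of the split above: a request for
	 * RTE_MEMZONE_2MB | RTE_MEMZONE_SIZE_HINT_ONLY on a socket with 2M
	 * and 1G lists ends up with requested_pg_sz = {2M} and
	 * other_pg_sz = {1G}.
	 */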

	/* finally, try allocating memory of specified page sizes, starting from
	 * the smallest sizes
	 */
	for (i = 0; i < n_requested_pg_sz; i++) {
		uint64_t pg_sz = requested_pg_sz[i];

		/*
		 * do not pass the size hint here, as user expects other page
		 * sizes first, before resorting to best effort allocation.
		 */
		if (!try_expand_heap(heap, pg_sz, size, socket, size_flags,
				align, bound, contig))
			return 0;
	}
	if (n_other_pg_sz == 0)
		return -1;

	/* now, check if we can reserve anything with size hint */
	ret = find_suitable_element(heap, size, flags, align, bound, contig);
	if (ret != NULL)
		return 0;

	/*
	 * we still couldn't reserve memory, so try expanding heap with other
	 * page sizes, if there are any
	 */
	for (i = 0; i < n_other_pg_sz; i++) {
		uint64_t pg_sz = other_pg_sz[i];

		if (!try_expand_heap(heap, pg_sz, size, socket, flags,
				align, bound, contig))
			return 0;
	}
	return -1;
}

/* this will try lower page sizes first */
static void *
malloc_heap_alloc_on_heap_id(const char *type, size_t size,
		unsigned int heap_id, unsigned int flags, size_t align,
		size_t bound, bool contig)
{
	struct rte_mem_config *mcfg = rte_eal_get_configuration()->mem_config;
	struct malloc_heap *heap = &mcfg->malloc_heaps[heap_id];
	unsigned int size_flags = flags & ~RTE_MEMZONE_SIZE_HINT_ONLY;
	int socket_id;
	void *ret;
	const struct internal_config *internal_conf =
		eal_get_internal_configuration();

	rte_spinlock_lock(&(heap->lock));

	align = align == 0 ? 1 : align;

	/* for legacy mode, try once and with all flags */
	if (internal_conf->legacy_mem) {
		ret = heap_alloc(heap, type, size, flags, align, bound, contig);
		goto alloc_unlock;
	}

	/*
	 * we do not pass the size hint here, because even if allocation fails,
	 * we may still be able to allocate memory from appropriate page sizes,
	 * we just need to request more memory first.
	 */

	socket_id = rte_socket_id_by_idx(heap_id);
	/*
	 * if socket ID is negative, we cannot find a socket ID for this heap -
	 * which means it's an external heap. those can have unexpected page
	 * sizes, so if the user asked to allocate from there - assume user
	 * knows what they're doing, and allow allocating from there with any
	 * page size flags.
	 */
	if (socket_id < 0)
		size_flags |= RTE_MEMZONE_SIZE_HINT_ONLY;

	ret = heap_alloc(heap, type, size, size_flags, align, bound, contig);
	if (ret != NULL)
		goto alloc_unlock;

	/* if socket ID is invalid, this is an external heap */
	if (socket_id < 0)
		goto alloc_unlock;

	if (!alloc_more_mem_on_socket(heap, size, socket_id, flags, align,
			bound, contig)) {
		ret = heap_alloc(heap, type, size, flags, align, bound, contig);

		/* this should have succeeded */
		if (ret == NULL)
			RTE_LOG(ERR, EAL, "Error allocating from heap\n");
	}
alloc_unlock:
	rte_spinlock_unlock(&(heap->lock));
	return ret;
}

static unsigned int
malloc_get_numa_socket(void)
{
	const struct internal_config *conf = eal_get_internal_configuration();
	unsigned int socket_id = rte_socket_id();
	unsigned int idx;

	if (socket_id != (unsigned int)SOCKET_ID_ANY)
		return socket_id;

	/* for control threads, return first socket where memory is available */
	for (idx = 0; idx < rte_socket_count(); idx++) {
		socket_id = rte_socket_id_by_idx(idx);
		if (conf->socket_mem[socket_id] != 0)
			return socket_id;
	}

	return rte_socket_id_by_idx(0);
}
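
/*
 * Usage sketch for the allocator entry point below (illustrative only;
 * "example" is a hypothetical type tag):
 *
 *	void *p = malloc_heap_alloc("example", 4096, SOCKET_ID_ANY,
 *			RTE_MEMZONE_SIZE_HINT_ONLY, RTE_CACHE_LINE_SIZE,
 *			0, false);
 */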

void *
malloc_heap_alloc(const char *type, size_t size, int socket_arg,
		unsigned int flags, size_t align, size_t bound, bool contig)
{
	int socket, heap_id, i;
	void *ret;

	/* return NULL if size is 0 or alignment is not power-of-2 */
	if (size == 0 || (align && !rte_is_power_of_2(align)))
		return NULL;

	if (!rte_eal_has_hugepages() && socket_arg < RTE_MAX_NUMA_NODES)
		socket_arg = SOCKET_ID_ANY;

	if (socket_arg == SOCKET_ID_ANY)
		socket = malloc_get_numa_socket();
	else
		socket = socket_arg;

	/* turn socket ID into heap ID */
	heap_id = malloc_socket_to_heap_id(socket);
	/* if heap id is negative, socket ID was invalid */
	if (heap_id < 0)
		return NULL;

	ret = malloc_heap_alloc_on_heap_id(type, size, heap_id, flags, align,
			bound, contig);
	if (ret != NULL || socket_arg != SOCKET_ID_ANY)
		return ret;

	/* try other heaps. we are only iterating through native DPDK sockets,
	 * so external heaps won't be included.
	 */
	for (i = 0; i < (int) rte_socket_count(); i++) {
		if (i == heap_id)
			continue;
		ret = malloc_heap_alloc_on_heap_id(type, size, i, flags, align,
				bound, contig);
		if (ret != NULL)
			return ret;
	}
	return NULL;
}

static void *
heap_alloc_biggest_on_heap_id(const char *type, unsigned int heap_id,
		unsigned int flags, size_t align, bool contig)
{
	struct rte_mem_config *mcfg = rte_eal_get_configuration()->mem_config;
	struct malloc_heap *heap = &mcfg->malloc_heaps[heap_id];
	void *ret;

	rte_spinlock_lock(&(heap->lock));

	align = align == 0 ? 1 : align;

	ret = heap_alloc_biggest(heap, type, flags, align, contig);

	rte_spinlock_unlock(&(heap->lock));

	return ret;
}

void *
malloc_heap_alloc_biggest(const char *type, int socket_arg, unsigned int flags,
		size_t align, bool contig)
{
	int socket, i, cur_socket, heap_id;
	void *ret;

	/* return NULL if align is not power-of-2 */
	if ((align && !rte_is_power_of_2(align)))
		return NULL;

	if (!rte_eal_has_hugepages())
		socket_arg = SOCKET_ID_ANY;

	if (socket_arg == SOCKET_ID_ANY)
		socket = malloc_get_numa_socket();
	else
		socket = socket_arg;

	/* turn socket ID into heap ID */
	heap_id = malloc_socket_to_heap_id(socket);
	/* if heap id is negative, socket ID was invalid */
	if (heap_id < 0)
		return NULL;

	ret = heap_alloc_biggest_on_heap_id(type, heap_id, flags, align,
			contig);
	if (ret != NULL || socket_arg != SOCKET_ID_ANY)
		return ret;

	/* try other heaps */
	for (i = 0; i < (int) rte_socket_count(); i++) {
		cur_socket = rte_socket_id_by_idx(i);
		if (cur_socket == socket)
			continue;
		ret = heap_alloc_biggest_on_heap_id(type, i, flags, align,
				contig);
		if (ret != NULL)
			return ret;
	}
	return NULL;
}
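
/*
 * Illustrative note for the page-freeing helper below: with 2M pages and
 * aligned_start one page into the list's VA area,
 * seg_idx = RTE_PTR_DIFF(aligned_start, msl->base_va) / page_sz == 1, and
 * n_segs = aligned_len / page_sz segments are then freed one by one.
 */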

/* this function is exposed in malloc_mp.h */
int
malloc_heap_free_pages(void *aligned_start, size_t aligned_len)
{
	int n_segs, seg_idx, max_seg_idx;
	struct rte_memseg_list *msl;
	size_t page_sz;

	msl = rte_mem_virt2memseg_list(aligned_start);
	if (msl == NULL)
		return -1;

	page_sz = (size_t)msl->page_sz;
	n_segs = aligned_len / page_sz;
	seg_idx = RTE_PTR_DIFF(aligned_start, msl->base_va) / page_sz;
	max_seg_idx = seg_idx + n_segs;

	for (; seg_idx < max_seg_idx; seg_idx++) {
		struct rte_memseg *ms;

		ms = rte_fbarray_get(&msl->memseg_arr, seg_idx);
		eal_memalloc_free_seg(ms);
	}
	return 0;
}
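
/*
 * Worked example for the trimming logic in malloc_heap_free() below
 * (illustrative numbers): with page_sz = 2M, a freed element spanning
 * [base + 1M, base + 7M) yields aligned_start = base + 2M and
 * aligned_end = base + 6M, so at most two 2M pages can be returned to the
 * system; the 1M head and tail stay in the heap as free space.
 */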
89399a2dd95SBruce Richardson */ 89499a2dd95SBruce Richardson if (internal_conf->legacy_mem || (msl->external > 0)) 89599a2dd95SBruce Richardson goto free_unlock; 89699a2dd95SBruce Richardson 89799a2dd95SBruce Richardson /* check if we can free any memory back to the system */ 89899a2dd95SBruce Richardson if (elem->size < page_sz) 89999a2dd95SBruce Richardson goto free_unlock; 90099a2dd95SBruce Richardson 90199a2dd95SBruce Richardson /* if user requested to match allocations, the sizes must match - if not, 90299a2dd95SBruce Richardson * we will defer freeing these hugepages until the entire original allocation 90399a2dd95SBruce Richardson * can be freed 90499a2dd95SBruce Richardson */ 90599a2dd95SBruce Richardson if (internal_conf->match_allocations && elem->size != elem->orig_size) 90699a2dd95SBruce Richardson goto free_unlock; 90799a2dd95SBruce Richardson 90899a2dd95SBruce Richardson /* probably, but let's make sure, as we may not be using up full page */ 90999a2dd95SBruce Richardson start = elem; 91099a2dd95SBruce Richardson len = elem->size; 91199a2dd95SBruce Richardson aligned_start = RTE_PTR_ALIGN_CEIL(start, page_sz); 91299a2dd95SBruce Richardson end = RTE_PTR_ADD(elem, len); 91399a2dd95SBruce Richardson aligned_end = RTE_PTR_ALIGN_FLOOR(end, page_sz); 91499a2dd95SBruce Richardson 91599a2dd95SBruce Richardson aligned_len = RTE_PTR_DIFF(aligned_end, aligned_start); 91699a2dd95SBruce Richardson 91799a2dd95SBruce Richardson /* can't free anything */ 91899a2dd95SBruce Richardson if (aligned_len < page_sz) 91999a2dd95SBruce Richardson goto free_unlock; 92099a2dd95SBruce Richardson 92199a2dd95SBruce Richardson /* we can free something. however, some of these pages may be marked as 92299a2dd95SBruce Richardson * unfreeable, so also check that as well 92399a2dd95SBruce Richardson */ 92499a2dd95SBruce Richardson n_segs = aligned_len / page_sz; 92599a2dd95SBruce Richardson for (i = 0; i < n_segs; i++) { 92699a2dd95SBruce Richardson const struct rte_memseg *tmp = 92799a2dd95SBruce Richardson rte_mem_virt2memseg(aligned_start, msl); 92899a2dd95SBruce Richardson 92999a2dd95SBruce Richardson if (tmp->flags & RTE_MEMSEG_FLAG_DO_NOT_FREE) { 93099a2dd95SBruce Richardson /* this is an unfreeable segment, so move start */ 93199a2dd95SBruce Richardson aligned_start = RTE_PTR_ADD(tmp->addr, tmp->len); 93299a2dd95SBruce Richardson } 93399a2dd95SBruce Richardson } 93499a2dd95SBruce Richardson 93599a2dd95SBruce Richardson /* recalculate length and number of segments */ 93699a2dd95SBruce Richardson aligned_len = RTE_PTR_DIFF(aligned_end, aligned_start); 93799a2dd95SBruce Richardson n_segs = aligned_len / page_sz; 93899a2dd95SBruce Richardson 93999a2dd95SBruce Richardson /* check if we can still free some pages */ 94099a2dd95SBruce Richardson if (n_segs == 0) 94199a2dd95SBruce Richardson goto free_unlock; 94299a2dd95SBruce Richardson 94399a2dd95SBruce Richardson /* We're not done yet. We also have to check if by freeing space we will 94499a2dd95SBruce Richardson * be leaving free elements that are too small to store new elements. 94599a2dd95SBruce Richardson * Check if we have enough space in the beginning and at the end, or if 94699a2dd95SBruce Richardson * start/end are exactly page aligned. 
94799a2dd95SBruce Richardson */ 94899a2dd95SBruce Richardson before_space = RTE_PTR_DIFF(aligned_start, elem); 94999a2dd95SBruce Richardson after_space = RTE_PTR_DIFF(end, aligned_end); 95099a2dd95SBruce Richardson if (before_space != 0 && 95199a2dd95SBruce Richardson before_space < MALLOC_ELEM_OVERHEAD + MIN_DATA_SIZE) { 95299a2dd95SBruce Richardson /* There is not enough space before start, but we may be able to 95399a2dd95SBruce Richardson * move the start forward by one page. 95499a2dd95SBruce Richardson */ 95599a2dd95SBruce Richardson if (n_segs == 1) 95699a2dd95SBruce Richardson goto free_unlock; 95799a2dd95SBruce Richardson 95899a2dd95SBruce Richardson /* move start */ 95999a2dd95SBruce Richardson aligned_start = RTE_PTR_ADD(aligned_start, page_sz); 96099a2dd95SBruce Richardson aligned_len -= page_sz; 96199a2dd95SBruce Richardson n_segs--; 96299a2dd95SBruce Richardson } 96399a2dd95SBruce Richardson if (after_space != 0 && after_space < 96499a2dd95SBruce Richardson MALLOC_ELEM_OVERHEAD + MIN_DATA_SIZE) { 96599a2dd95SBruce Richardson /* There is not enough space after end, but we may be able to 96699a2dd95SBruce Richardson * move the end backwards by one page. 96799a2dd95SBruce Richardson */ 96899a2dd95SBruce Richardson if (n_segs == 1) 96999a2dd95SBruce Richardson goto free_unlock; 97099a2dd95SBruce Richardson 97199a2dd95SBruce Richardson /* move end */ 97299a2dd95SBruce Richardson aligned_end = RTE_PTR_SUB(aligned_end, page_sz); 97399a2dd95SBruce Richardson aligned_len -= page_sz; 97499a2dd95SBruce Richardson n_segs--; 97599a2dd95SBruce Richardson } 97699a2dd95SBruce Richardson 97799a2dd95SBruce Richardson /* now we can finally free us some pages */ 97899a2dd95SBruce Richardson 97999a2dd95SBruce Richardson rte_mcfg_mem_write_lock(); 98099a2dd95SBruce Richardson 98199a2dd95SBruce Richardson /* 98299a2dd95SBruce Richardson * we allow secondary processes to clear the heap of this allocated 98399a2dd95SBruce Richardson * memory because it is safe to do so, as even if notifications about 98499a2dd95SBruce Richardson * unmapped pages don't make it to other processes, heap is shared 98599a2dd95SBruce Richardson * across all processes, and will become empty of this memory anyway, 98699a2dd95SBruce Richardson * and nothing can allocate it back unless primary process will be able 98799a2dd95SBruce Richardson * to deliver allocation message to every single running process. 
98899a2dd95SBruce Richardson */ 98999a2dd95SBruce Richardson 99099a2dd95SBruce Richardson malloc_elem_free_list_remove(elem); 99199a2dd95SBruce Richardson 99299a2dd95SBruce Richardson malloc_elem_hide_region(elem, (void *) aligned_start, aligned_len); 99399a2dd95SBruce Richardson 99499a2dd95SBruce Richardson heap->total_size -= aligned_len; 99599a2dd95SBruce Richardson 99699a2dd95SBruce Richardson if (rte_eal_process_type() == RTE_PROC_PRIMARY) { 99799a2dd95SBruce Richardson /* notify user about changes in memory map */ 99899a2dd95SBruce Richardson eal_memalloc_mem_event_notify(RTE_MEM_EVENT_FREE, 99999a2dd95SBruce Richardson aligned_start, aligned_len); 100099a2dd95SBruce Richardson 100199a2dd95SBruce Richardson /* don't care if any of this fails */ 100299a2dd95SBruce Richardson malloc_heap_free_pages(aligned_start, aligned_len); 100399a2dd95SBruce Richardson 100499a2dd95SBruce Richardson request_sync(); 100599a2dd95SBruce Richardson } else { 100699a2dd95SBruce Richardson struct malloc_mp_req req; 100799a2dd95SBruce Richardson 100899a2dd95SBruce Richardson memset(&req, 0, sizeof(req)); 100999a2dd95SBruce Richardson 101099a2dd95SBruce Richardson req.t = REQ_TYPE_FREE; 101199a2dd95SBruce Richardson req.free_req.addr = aligned_start; 101299a2dd95SBruce Richardson req.free_req.len = aligned_len; 101399a2dd95SBruce Richardson 101499a2dd95SBruce Richardson /* 101599a2dd95SBruce Richardson * we request primary to deallocate pages, but we don't do it 101699a2dd95SBruce Richardson * in this thread. instead, we notify primary that we would like 101799a2dd95SBruce Richardson * to deallocate pages, and this process will receive another 101899a2dd95SBruce Richardson * request (in parallel) that will do it for us on another 101999a2dd95SBruce Richardson * thread. 102099a2dd95SBruce Richardson * 102199a2dd95SBruce Richardson * we also don't really care if this succeeds - the data is 102299a2dd95SBruce Richardson * already removed from the heap, so it is, for all intents and 102399a2dd95SBruce Richardson * purposes, hidden from the rest of DPDK even if some other 102499a2dd95SBruce Richardson * process (including this one) may have these pages mapped. 102599a2dd95SBruce Richardson * 102699a2dd95SBruce Richardson * notifications about deallocated memory happen during sync. 
102799a2dd95SBruce Richardson */ 102899a2dd95SBruce Richardson request_to_primary(&req); 102999a2dd95SBruce Richardson } 103099a2dd95SBruce Richardson 10314d8bdd8bSAnatoly Burakov /* we didn't exit early, meaning we have unmapped some pages */ 10324d8bdd8bSAnatoly Burakov unmapped = true; 10334d8bdd8bSAnatoly Burakov 103499a2dd95SBruce Richardson RTE_LOG(DEBUG, EAL, "Heap on socket %d was shrunk by %zdMB\n", 103599a2dd95SBruce Richardson msl->socket_id, aligned_len >> 20ULL); 103699a2dd95SBruce Richardson 103799a2dd95SBruce Richardson rte_mcfg_mem_write_unlock(); 103899a2dd95SBruce Richardson free_unlock: 10396cc51b12SZhihong Peng asan_set_freezone(asan_ptr, asan_data_len); 10406cc51b12SZhihong Peng 10414d8bdd8bSAnatoly Burakov /* if we unmapped some memory, we need to do additional work for ASan */ 10424d8bdd8bSAnatoly Burakov if (unmapped) { 10434d8bdd8bSAnatoly Burakov void *asan_end = RTE_PTR_ADD(asan_ptr, asan_data_len); 10444d8bdd8bSAnatoly Burakov void *aligned_end = RTE_PTR_ADD(aligned_start, aligned_len); 10454d8bdd8bSAnatoly Burakov void *aligned_trailer = RTE_PTR_SUB(aligned_start, 10464d8bdd8bSAnatoly Burakov MALLOC_ELEM_TRAILER_LEN); 10474d8bdd8bSAnatoly Burakov 10484d8bdd8bSAnatoly Burakov /* 10494d8bdd8bSAnatoly Burakov * There was a memory area that was unmapped. This memory area 10504d8bdd8bSAnatoly Burakov * will have to be marked as available for ASan, because we will 10514d8bdd8bSAnatoly Burakov * want to use it next time it gets mapped again. The OS memory 10524d8bdd8bSAnatoly Burakov * protection should trigger a fault on access to these areas 10534d8bdd8bSAnatoly Burakov * anyway, so we are not giving up any protection. 10544d8bdd8bSAnatoly Burakov */ 10554d8bdd8bSAnatoly Burakov asan_set_zone(aligned_start, aligned_len, 0x00); 10564d8bdd8bSAnatoly Burakov 10574d8bdd8bSAnatoly Burakov /* 10584d8bdd8bSAnatoly Burakov * ...however, when we unmap pages, we create new free elements 10594d8bdd8bSAnatoly Burakov * which might have been marked as "freed" with an earlier 10604d8bdd8bSAnatoly Burakov * `asan_set_freezone` call. So, if there is an area past the 10614d8bdd8bSAnatoly Burakov * unmapped space that was marked as freezone for ASan, we need 10624d8bdd8bSAnatoly Burakov * to mark the malloc header as available. 
int
malloc_heap_resize(struct malloc_elem *elem, size_t size)
{
	int ret;

	if (!malloc_elem_cookies_ok(elem) || elem->state != ELEM_BUSY)
		return -1;

	rte_spinlock_lock(&(elem->heap->lock));

	ret = malloc_elem_resize(elem, size);

	rte_spinlock_unlock(&(elem->heap->lock));

	return ret;
}

/*
 * Function to retrieve statistics for a given heap
 */
int
malloc_heap_get_stats(struct malloc_heap *heap,
		struct rte_malloc_socket_stats *socket_stats)
{
	size_t idx;
	struct malloc_elem *elem;

	rte_spinlock_lock(&heap->lock);

	/* Initialise variables for heap */
	socket_stats->free_count = 0;
	socket_stats->heap_freesz_bytes = 0;
	socket_stats->greatest_free_size = 0;

	/* Iterate through free lists */
	for (idx = 0; idx < RTE_HEAP_NUM_FREELISTS; idx++) {
		for (elem = LIST_FIRST(&heap->free_head[idx]);
				elem != NULL;
				elem = LIST_NEXT(elem, free_list)) {
			socket_stats->free_count++;
			socket_stats->heap_freesz_bytes += elem->size;
			if (elem->size > socket_stats->greatest_free_size)
				socket_stats->greatest_free_size = elem->size;
		}
	}
	/* Get stats on overall heap and allocated memory on this heap */
	socket_stats->heap_totalsz_bytes = heap->total_size;
	socket_stats->heap_allocsz_bytes = (socket_stats->heap_totalsz_bytes -
			socket_stats->heap_freesz_bytes);
	socket_stats->alloc_count = heap->alloc_count;

	rte_spinlock_unlock(&heap->lock);
	return 0;
}
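
/*
 * malloc_heap_get_stats() is normally reached through the public
 * rte_malloc_get_socket_stats() wrapper. A minimal usage sketch follows;
 * the helper name is hypothetical, the API and struct fields are real.
 */
#if 0 /* illustrative example, not compiled as part of this file */
#include <stdio.h>
#include <rte_malloc.h>

static void
print_socket_stats(int socket)
{
	struct rte_malloc_socket_stats stats;

	if (rte_malloc_get_socket_stats(socket, &stats) < 0)
		return;
	printf("socket %d: total %zu, free %zu, largest free block %zu\n",
		socket, stats.heap_totalsz_bytes, stats.heap_freesz_bytes,
		stats.greatest_free_size);
}
#endif
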
/*
 * Function to dump the contents of a given heap
 */
void
malloc_heap_dump(struct malloc_heap *heap, FILE *f)
{
	struct malloc_elem *elem;

	rte_spinlock_lock(&heap->lock);

	fprintf(f, "Heap size: 0x%zx\n", heap->total_size);
	fprintf(f, "Heap alloc count: %u\n", heap->alloc_count);

	elem = heap->first;
	while (elem) {
		malloc_elem_dump(elem, f);
		elem = elem->next;
	}

	rte_spinlock_unlock(&heap->lock);
}
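
/*
 * Applications reach the dump above via the public rte_malloc_dump_heaps()
 * API, which walks every heap. Sketch (the wrapper function is
 * hypothetical, the API call is real):
 */
#if 0 /* illustrative example, not compiled as part of this file */
#include <stdio.h>
#include <rte_malloc.h>

static void
dump_all_heaps(void)
{
	/* prints every heap, element by element, to the given stream */
	rte_malloc_dump_heaps(stdout);
}
#endif
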
static int
destroy_elem(struct malloc_elem *elem, size_t len)
{
	struct malloc_heap *heap = elem->heap;

	/* notify all subscribers that a memory area is going to be removed */
	eal_memalloc_mem_event_notify(RTE_MEM_EVENT_FREE, elem, len);

	/* this element can be removed */
	malloc_elem_free_list_remove(elem);
	malloc_elem_hide_region(elem, elem, len);

	heap->total_size -= len;

	memset(elem, 0, sizeof(*elem));

	return 0;
}

struct rte_memseg_list *
malloc_heap_create_external_seg(void *va_addr, rte_iova_t iova_addrs[],
		unsigned int n_pages, size_t page_sz, const char *seg_name,
		unsigned int socket_id)
{
	struct rte_mem_config *mcfg = rte_eal_get_configuration()->mem_config;
	char fbarray_name[RTE_FBARRAY_NAME_LEN];
	struct rte_memseg_list *msl = NULL;
	struct rte_fbarray *arr;
	size_t seg_len = n_pages * page_sz;
	unsigned int i;

	/* first, find a free memseg list */
	for (i = 0; i < RTE_MAX_MEMSEG_LISTS; i++) {
		struct rte_memseg_list *tmp = &mcfg->memsegs[i];

		if (tmp->base_va == NULL) {
			msl = tmp;
			break;
		}
	}
	if (msl == NULL) {
		RTE_LOG(ERR, EAL, "Couldn't find empty memseg list\n");
		rte_errno = ENOSPC;
		return NULL;
	}

	snprintf(fbarray_name, sizeof(fbarray_name), "%s_%p",
			seg_name, va_addr);

	/* create the backing fbarray */
	if (rte_fbarray_init(&msl->memseg_arr, fbarray_name, n_pages,
			sizeof(struct rte_memseg)) < 0) {
		RTE_LOG(ERR, EAL, "Couldn't create fbarray backing the memseg list\n");
		return NULL;
	}
	arr = &msl->memseg_arr;

	/* fbarray created, fill it up */
	for (i = 0; i < n_pages; i++) {
		struct rte_memseg *ms;

		rte_fbarray_set_used(arr, i);
		ms = rte_fbarray_get(arr, i);
		ms->addr = RTE_PTR_ADD(va_addr, i * page_sz);
		ms->iova = iova_addrs == NULL ? RTE_BAD_IOVA : iova_addrs[i];
		ms->hugepage_sz = page_sz;
		ms->len = page_sz;
		ms->nchannel = rte_memory_get_nchannel();
		ms->nrank = rte_memory_get_nrank();
		ms->socket_id = socket_id;
	}

	/* set up the memseg list */
	msl->base_va = va_addr;
	msl->page_sz = page_sz;
	msl->socket_id = socket_id;
	msl->len = seg_len;
	msl->version = 0;
	msl->external = 1;

	return msl;
}
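
/*
 * malloc_heap_create_external_seg() is driven by the public
 * rte_malloc_heap_memory_add() API, which takes an optional per-page IOVA
 * table. A sketch follows; it assumes heap_name refers to an external heap
 * already created with rte_malloc_heap_create(), and the buffer, page size,
 * and helper name are made up for illustration.
 */
#if 0 /* illustrative example, not compiled as part of this file */
#include <sys/mman.h>
#include <rte_malloc.h>

#define EXT_PAGE_SZ 4096 /* pretend pages; must be a power of two */
#define EXT_N_PAGES 16

static int
add_anon_memory_to_heap(const char *heap_name)
{
	size_t len = (size_t)EXT_N_PAGES * EXT_PAGE_SZ;
	void *addr = mmap(NULL, len, PROT_READ | PROT_WRITE,
			MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);

	if (addr == MAP_FAILED)
		return -1;
	/* a NULL IOVA table marks every page as RTE_BAD_IOVA, as above */
	return rte_malloc_heap_memory_add(heap_name, addr, len,
			NULL, EXT_N_PAGES, EXT_PAGE_SZ);
}
#endif
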
struct extseg_walk_arg {
	void *va_addr;
	size_t len;
	struct rte_memseg_list *msl;
};

static int
extseg_walk(const struct rte_memseg_list *msl, void *arg)
{
	struct rte_mem_config *mcfg = rte_eal_get_configuration()->mem_config;
	struct extseg_walk_arg *wa = arg;

	if (msl->base_va == wa->va_addr && msl->len == wa->len) {
		unsigned int found_idx;

		/* msl is const */
		found_idx = msl - mcfg->memsegs;
		wa->msl = &mcfg->memsegs[found_idx];
		return 1;
	}
	return 0;
}

struct rte_memseg_list *
malloc_heap_find_external_seg(void *va_addr, size_t len)
{
	struct extseg_walk_arg wa;
	int res;

	wa.va_addr = va_addr;
	wa.len = len;

	res = rte_memseg_list_walk_thread_unsafe(extseg_walk, &wa);

	if (res != 1) {
		/* 0 means nothing was found, -1 shouldn't happen */
		if (res == 0)
			rte_errno = ENOENT;
		return NULL;
	}
	return wa.msl;
}
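
/*
 * extseg_walk() above follows the standard memseg-walk convention: return 0
 * to keep walking, 1 to stop (reported as found), negative on error. The
 * same pattern is available to applications through the public
 * rte_memseg_list_walk() API. Sketch (the counting callback and helper are
 * hypothetical):
 */
#if 0 /* illustrative example, not compiled as part of this file */
#include <rte_memory.h>

static int
count_external_msl(const struct rte_memseg_list *msl, void *arg)
{
	unsigned int *cnt = arg;

	if (msl->external)
		(*cnt)++;
	return 0; /* keep walking */
}

static unsigned int
count_external_segments(void)
{
	unsigned int cnt = 0;

	/* unlike the thread-unsafe variant above, this takes the lock */
	rte_memseg_list_walk(count_external_msl, &cnt);
	return cnt;
}
#endif
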
int
malloc_heap_destroy_external_seg(struct rte_memseg_list *msl)
{
	/* destroy the fbarray backing this memory */
	if (rte_fbarray_destroy(&msl->memseg_arr) < 0)
		return -1;

	/* reset the memseg list */
	memset(msl, 0, sizeof(*msl));

	return 0;
}

int
malloc_heap_add_external_memory(struct malloc_heap *heap,
		struct rte_memseg_list *msl)
{
	/* erase contents of new memory */
	memset(msl->base_va, 0, msl->len);

	/* now, add newly minted memory to the malloc heap */
	malloc_heap_add_memory(heap, msl, msl->base_va, msl->len, false);

	heap->total_size += msl->len;

	/* all done! */
	RTE_LOG(DEBUG, EAL, "Added segment for heap %s starting at %p\n",
			heap->name, msl->base_va);

	/* notify all subscribers that a new memory area has been added */
	eal_memalloc_mem_event_notify(RTE_MEM_EVENT_ALLOC,
			msl->base_va, msl->len);

	return 0;
}
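
/*
 * The add path above and the remove path below map to the public
 * rte_malloc_heap_memory_add()/rte_malloc_heap_memory_remove() pair;
 * memory can only be removed once nothing is allocated from it. Sketch
 * (the helper name is hypothetical; addr/len are assumed to match an
 * earlier add):
 */
#if 0 /* illustrative example, not compiled as part of this file */
#include <rte_malloc.h>

static int
detach_memory(const char *heap_name, void *addr, size_t len)
{
	/* fails with EBUSY while any allocation still lives in [addr, len) */
	return rte_malloc_heap_memory_remove(heap_name, addr, len);
}
#endif
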
int
malloc_heap_remove_external_memory(struct malloc_heap *heap, void *va_addr,
		size_t len)
{
	struct malloc_elem *elem = heap->first;

	/* find element with specified va address */
	while (elem != NULL && elem != va_addr) {
		elem = elem->next;
		/* stop if we've blown past our VA */
		if (elem > (struct malloc_elem *)va_addr) {
			rte_errno = ENOENT;
			return -1;
		}
	}
	/* check if element was found */
	if (elem == NULL || elem->msl->len != len) {
		rte_errno = ENOENT;
		return -1;
	}
	/* if element's size is not equal to segment len, segment is busy */
	if (elem->state == ELEM_BUSY || elem->size != len) {
		rte_errno = EBUSY;
		return -1;
	}
	return destroy_elem(elem, len);
}

int
malloc_heap_create(struct malloc_heap *heap, const char *heap_name)
{
	struct rte_mem_config *mcfg = rte_eal_get_configuration()->mem_config;
	uint32_t next_socket_id = mcfg->next_socket_id;

	/* prevent overflow - did you really create 2 billion heaps? */
	if (next_socket_id > INT32_MAX) {
		RTE_LOG(ERR, EAL, "Cannot assign new socket IDs\n");
		rte_errno = ENOSPC;
		return -1;
	}

	/* initialize empty heap */
	heap->alloc_count = 0;
	heap->first = NULL;
	heap->last = NULL;
	LIST_INIT(heap->free_head);
	rte_spinlock_init(&heap->lock);
	heap->total_size = 0;
	heap->socket_id = next_socket_id;

	/* we hold a global mem hotplug writelock, so it's safe to increment */
	mcfg->next_socket_id++;

	/* set up name */
	strlcpy(heap->name, heap_name, RTE_HEAP_NAME_MAX_LEN);
	return 0;
}
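
/*
 * malloc_heap_create() backs the public rte_malloc_heap_create() API. Once
 * a named heap exists and has memory, its synthetic socket ID works with
 * the socket-aware allocators. Sketch (the helper name is hypothetical,
 * the API calls are real):
 */
#if 0 /* illustrative example, not compiled as part of this file */
#include <rte_malloc.h>

static void *
alloc_from_named_heap(const char *heap_name, size_t sz)
{
	int socket_id;

	if (rte_malloc_heap_create(heap_name) < 0)
		return NULL;
	/* external heaps get socket IDs >= EXTERNAL_HEAP_MIN_SOCKET_ID */
	socket_id = rte_malloc_heap_get_socket(heap_name);
	if (socket_id < 0)
		return NULL;
	/* fails until memory is added with rte_malloc_heap_memory_add() */
	return rte_malloc_socket("example", sz, 0, socket_id);
}
#endif
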
int
malloc_heap_destroy(struct malloc_heap *heap)
{
	if (heap->alloc_count != 0) {
		RTE_LOG(ERR, EAL, "Heap is still in use\n");
		rte_errno = EBUSY;
		return -1;
	}
	if (heap->first != NULL || heap->last != NULL) {
		RTE_LOG(ERR, EAL, "Heap still contains memory segments\n");
		rte_errno = EBUSY;
		return -1;
	}
	if (heap->total_size != 0)
		RTE_LOG(ERR, EAL, "Total size not zero, heap is likely corrupt\n");

	/* after this, the lock will be dropped */
	memset(heap, 0, sizeof(*heap));

	return 0;
}

int
rte_eal_malloc_heap_init(void)
{
	struct rte_mem_config *mcfg = rte_eal_get_configuration()->mem_config;
	unsigned int i;
	const struct internal_config *internal_conf =
		eal_get_internal_configuration();

	if (internal_conf->match_allocations)
		RTE_LOG(DEBUG, EAL, "Hugepages will be freed exactly as allocated.\n");

	if (rte_eal_process_type() == RTE_PROC_PRIMARY) {
		/* assign min socket ID to external heaps */
		mcfg->next_socket_id = EXTERNAL_HEAP_MIN_SOCKET_ID;

		/* assign names to default DPDK heaps */
		for (i = 0; i < rte_socket_count(); i++) {
			struct malloc_heap *heap = &mcfg->malloc_heaps[i];
			char heap_name[RTE_HEAP_NAME_MAX_LEN];
			int socket_id = rte_socket_id_by_idx(i);

			snprintf(heap_name, sizeof(heap_name),
					"socket_%i", socket_id);
			strlcpy(heap->name, heap_name, RTE_HEAP_NAME_MAX_LEN);
			heap->socket_id = socket_id;
		}
	}

	if (register_mp_requests()) {
		RTE_LOG(ERR, EAL, "Couldn't register malloc multiprocess actions\n");
		rte_mcfg_mem_read_unlock();
		return -1;
	}

	/* unlock mem hotplug here. it's safe for primary as no requests can
	 * even come before primary itself is fully initialized, and secondaries
	 * do not need to initialize the heap.
	 */
	rte_mcfg_mem_read_unlock();

	/* secondary process does not need to initialize anything */
	if (rte_eal_process_type() != RTE_PROC_PRIMARY)
		return 0;

	/* add all IOVA-contiguous areas to the heap */
	return rte_memseg_contig_walk(malloc_add_seg, NULL);
}

void
rte_eal_malloc_heap_cleanup(void)
{
	unregister_mp_requests();
}
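
/*
 * rte_eal_malloc_heap_init() and rte_eal_malloc_heap_cleanup() are internal
 * entry points run as part of EAL startup and teardown, so from an
 * application's point of view the lifecycle is simply the usual EAL one.
 * Minimal sketch (assumes a standard EAL environment):
 */
#if 0 /* illustrative example, not compiled as part of this file */
#include <rte_eal.h>
#include <rte_malloc.h>

int
main(int argc, char **argv)
{
	void *buf;

	if (rte_eal_init(argc, argv) < 0)
		return -1;

	/* the malloc heaps are ready once rte_eal_init() returns */
	buf = rte_malloc(NULL, 4096, 0);
	rte_free(buf);

	return rte_eal_cleanup();
}
#endif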