@@ -132,7 +132,7 @@ memzone_reserve_aligned_thread_unsafe(const char *name, size_t len,
struct rte_memzone *mz;
struct rte_mem_config *mcfg;
size_t requested_len;
- int socket, i;
+ int socket;
/* get pointer to global configuration */
mcfg = rte_eal_get_configuration()->mem_config;
@@ -216,21 +216,8 @@ memzone_reserve_aligned_thread_unsafe(const char *name, size_t len,
socket = socket_id;
/* allocate memory on heap */
- void *mz_addr = malloc_heap_alloc(&mcfg->malloc_heaps[socket], NULL,
- requested_len, flags, align, bound);
-
- if ((mz_addr == NULL) && (socket_id == SOCKET_ID_ANY)) {
- /* try other heaps */
- for (i = 0; i < RTE_MAX_NUMA_NODES; i++) {
- if (socket == i)
- continue;
-
- mz_addr = malloc_heap_alloc(&mcfg->malloc_heaps[i],
- NULL, requested_len, flags, align, bound);
- if (mz_addr != NULL)
- break;
- }
- }
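+ /* if socket_id is SOCKET_ID_ANY, this will fall back to other heaps */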
+ void *mz_addr = malloc_heap_alloc(NULL, requested_len, socket, flags,
+ align, bound);
if (mz_addr == NULL) {
rte_errno = ENOMEM;
@@ -269,8 +269,8 @@ malloc_elem_free_list_insert(struct malloc_elem *elem)
/*
* Remove the specified element from its heap's free list.
*/
-static void
-elem_free_list_remove(struct malloc_elem *elem)
+void
+malloc_elem_free_list_remove(struct malloc_elem *elem)
{
LIST_REMOVE(elem, free_list);
}
@@ -290,7 +290,7 @@ malloc_elem_alloc(struct malloc_elem *elem, size_t size, unsigned align,
const size_t trailer_size = elem->size - old_elem_size - size -
MALLOC_ELEM_OVERHEAD;
- elem_free_list_remove(elem);
+ malloc_elem_free_list_remove(elem);
if (trailer_size > MALLOC_ELEM_OVERHEAD + MIN_DATA_SIZE) {
/* split it, too much free space after elem */
@@ -363,7 +363,7 @@ malloc_elem_join_adjacent_free(struct malloc_elem *elem) {
erase = RTE_PTR_SUB(elem->next, MALLOC_ELEM_TRAILER_LEN);
/* remove from free list, join to this one */
- elem_free_list_remove(elem->next);
+ malloc_elem_free_list_remove(elem->next);
join_elem(elem, elem->next);
/* erase header and trailer */
@@ -383,7 +383,7 @@ malloc_elem_join_adjacent_free(struct malloc_elem *elem) {
erase = RTE_PTR_SUB(elem, MALLOC_ELEM_TRAILER_LEN);
/* remove from free list, join to this one */
- elem_free_list_remove(elem->prev);
+ malloc_elem_free_list_remove(elem->prev);
new_elem = elem->prev;
join_elem(new_elem, elem);
@@ -402,7 +402,7 @@ malloc_elem_join_adjacent_free(struct malloc_elem *elem) {
* blocks either immediately before or immediately after newly freed block
* are also free, the blocks are merged together.
*/
-int
+struct malloc_elem *
malloc_elem_free(struct malloc_elem *elem)
{
void *ptr;
@@ -420,7 +420,87 @@ malloc_elem_free(struct malloc_elem *elem)
memset(ptr, 0, data_len);
- return 0;
+ return elem;
+}
+
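+/*
+ * Remove the memory covered by [start, start + len) from the allocator:
+ * split off or pad whatever usable space is left before and after the
+ * hidden region.
+ */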
+/* assume all checks were already done */
+void
+malloc_elem_hide_region(struct malloc_elem *elem, void *start, size_t len)
+{
+ size_t len_before, len_after;
+ struct malloc_elem *prev, *next;
+ void *end, *elem_end;
+
+ end = RTE_PTR_ADD(start, len);
+ elem_end = RTE_PTR_ADD(elem, elem->size);
+ len_before = RTE_PTR_DIFF(start, elem);
+ len_after = RTE_PTR_DIFF(elem_end, end);
+
+ prev = elem->prev;
+ next = elem->next;
+
+ if (len_after >= MALLOC_ELEM_OVERHEAD + MIN_DATA_SIZE) {
+ /* split after */
+ struct malloc_elem *split_after = end;
+
+ split_elem(elem, split_after);
+
+ next = split_after;
+
+ malloc_elem_free_list_insert(split_after);
+ } else if (len_after >= MALLOC_ELEM_HEADER_LEN) {
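+ /*
+ * not enough space for a free element after the hidden region, so
+ * fold the leftover into the next element's pad: the real header
+ * moves back to 'end', and the old header location becomes an
+ * ELEM_PAD marker sitting just in front of the data.
+ */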
+ struct malloc_elem *pad_elem = end;
+
+ /* shrink current element */
+ elem->size -= len_after;
+ memset(pad_elem, 0, sizeof(*pad_elem));
+
+ /* copy next element's data to our pad */
+ memcpy(pad_elem, next, sizeof(*pad_elem));
+
+ /* pad next element */
+ next->state = ELEM_PAD;
+ next->pad = len_after;
+
+ /* next element is busy, would've been merged otherwise */
+ pad_elem->pad = len_after;
+ pad_elem->size += len_after;
+ } else if (len_after > 0) {
+ rte_panic("Unaligned element, heap is probably corrupt\n");
+ }
+
+ if (len_before >= MALLOC_ELEM_OVERHEAD + MIN_DATA_SIZE) {
+ /* split before */
+ struct malloc_elem *split_before = start;
+
+ split_elem(elem, split_before);
+
+ prev = elem;
+ elem = split_before;
+
+ malloc_elem_free_list_insert(prev);
+ } else if (len_before > 0) {
+ /*
+ * unlike with elements after current, here we don't need to
+ * pad elements, but rather just increase the size of previous
+ * element, copy the old header and set up the trailer.
+ */
+ void *trailer = RTE_PTR_ADD(prev,
+ prev->size - MALLOC_ELEM_TRAILER_LEN);
+ struct malloc_elem *new_elem = start;
+
+ memcpy(new_elem, elem, sizeof(*elem));
+ new_elem->size -= len_before;
+
+ prev->size += len_before;
+ set_trailer(prev);
+
+ elem = new_elem;
+
+ /* erase old trailer */
+ memset(trailer, 0, MALLOC_ELEM_TRAILER_LEN);
+ }
+
+ remove_elem(elem);
}
/*
@@ -446,7 +526,7 @@ malloc_elem_resize(struct malloc_elem *elem, size_t size)
/* we now know the element fits, so remove from free list,
* join the two
*/
- elem_free_list_remove(elem->next);
+ malloc_elem_free_list_remove(elem->next);
join_elem(elem, elem->next);
if (elem->size - new_size >= MIN_DATA_SIZE + MALLOC_ELEM_OVERHEAD) {
@@ -164,7 +164,7 @@ malloc_elem_alloc(struct malloc_elem *elem, size_t size,
* blocks either immediately before or immediately after newly freed block
* are also free, the blocks are merged together.
*/
-int
+struct malloc_elem *
malloc_elem_free(struct malloc_elem *elem);
struct malloc_elem *
@@ -177,6 +177,12 @@ malloc_elem_join_adjacent_free(struct malloc_elem *elem);
int
malloc_elem_resize(struct malloc_elem *elem, size_t size);
+void
+malloc_elem_hide_region(struct malloc_elem *elem, void *start, size_t len);
+
+void
+malloc_elem_free_list_remove(struct malloc_elem *elem);
+
/*
* Given an element size, compute its freelist index.
*/
@@ -49,8 +49,10 @@
#include <rte_spinlock.h>
#include <rte_memcpy.h>
#include <rte_atomic.h>
+#include <rte_fbarray.h>
#include "eal_internal_cfg.h"
+#include "eal_memalloc.h"
#include "malloc_elem.h"
#include "malloc_heap.h"
@@ -151,46 +153,304 @@ find_suitable_element(struct malloc_heap *heap, size_t size,
* scan fails. Once the new memseg is added, it re-scans and should return
* the new element after releasing the lock.
*/
-void *
-malloc_heap_alloc(struct malloc_heap *heap,
- const char *type __attribute__((unused)), size_t size, unsigned flags,
- size_t align, size_t bound)
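+/* this expects the heap lock to already be held by the caller */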
+static void *
+heap_alloc(struct malloc_heap *heap, const char *type __rte_unused, size_t size,
+ unsigned flags, size_t align, size_t bound)
{
struct malloc_elem *elem;
size = RTE_CACHE_LINE_ROUNDUP(size);
align = RTE_CACHE_LINE_ROUNDUP(align);
- rte_spinlock_lock(&heap->lock);
-
elem = find_suitable_element(heap, size, flags, align, bound);
if (elem != NULL) {
elem = malloc_elem_alloc(elem, size, align, bound);
+
/* increase heap's count of allocated elements */
heap->alloc_count++;
}
- rte_spinlock_unlock(&heap->lock);
return elem == NULL ? NULL : (void *)(&elem[1]);
}
+static void *
+try_expand_heap(struct malloc_heap *heap, struct rte_memseg_list *msl,
+ const char *type, size_t size, int socket, unsigned flags,
+ size_t align, size_t bound)
+{
+ struct malloc_elem *elem;
+ struct rte_memseg **ms;
+ size_t map_len;
+ void *map_addr;
+ int i, n_pages, allocd_pages;
+ void *ret;
+
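+ /*
+ * account for element header/trailer overhead and alignment padding,
+ * then round the mapping up to a whole number of pages
+ */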
+ align = RTE_MAX(align, MALLOC_ELEM_HEADER_LEN);
+ map_len = RTE_ALIGN_CEIL(align + size + MALLOC_ELEM_TRAILER_LEN,
+ msl->hugepage_sz);
+
+ n_pages = map_len / msl->hugepage_sz;
+
+ /* we can't know in advance how many pages we'll need, so malloc */
+ ms = malloc(sizeof(*ms) * n_pages);
+ if (ms == NULL)
+ return NULL;
+
+ allocd_pages = eal_memalloc_alloc_page_bulk(ms, n_pages,
+ msl->hugepage_sz, socket, true);
+
+ /* make sure we've allocated our pages... */
+ if (allocd_pages != n_pages)
+ goto free_ms;
+
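+ /* pages were allocated contiguously, so the run starts at page zero */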
+ map_addr = ms[0]->addr;
+
+ /* add newly minted memsegs to malloc heap */
+ elem = malloc_heap_add_memory(heap, msl, map_addr, map_len);
+
+ RTE_LOG(DEBUG, EAL, "Heap on socket %d was expanded by %zdMB\n",
+ msl->socket_id, map_len >> 20ULL);
+
+ /* try once more, as now we have allocated new memory */
+ ret = heap_alloc(heap, type, size, flags,
+ align == 0 ? 1 : align, bound);
+
+ if (ret == NULL)
+ goto free_elem;
+
+ free(ms);
+ return ret;
+
+free_elem:
+ malloc_elem_free_list_remove(elem);
+ malloc_elem_hide_region(elem, map_addr, map_len);
+ heap->total_size -= map_len;
+
+ RTE_LOG(DEBUG, EAL, "%s(): couldn't allocate, so shrinking heap on socket %d by %zdMB\n",
+ __func__, socket, map_len >> 20ULL);
+
+ for (i = 0; i < n_pages; i++)
+ eal_memalloc_free_page(ms[i]);
+free_ms:
+ free(ms);
+ return NULL;
+}
+
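+/* qsort comparator: orders memseg lists by page size, smallest first */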
+static int
+compare_pagesz(const void *a, const void *b)
+{
+ /* qsort hands us pointers to the array slots, not the msls themselves */
+ const struct rte_memseg_list * const *mpa = a;
+ const struct rte_memseg_list * const *mpb = b;
+ const struct rte_memseg_list *msla = *mpa;
+ const struct rte_memseg_list *mslb = *mpb;
+
+ if (msla->hugepage_sz < mslb->hugepage_sz)
+ return -1;
+ if (msla->hugepage_sz > mslb->hugepage_sz)
+ return 1;
+ return 0;
+}
+
+/* this will try lower page sizes first */
+static void *
+heap_alloc_on_socket(const char *type, size_t size, int socket,
+ unsigned flags, size_t align, size_t bound)
+{
+ struct rte_mem_config *mcfg = rte_eal_get_configuration()->mem_config;
+ struct malloc_heap *heap = &mcfg->malloc_heaps[socket];
+ struct rte_memseg_list *requested_msls[RTE_MAX_MEMSEG_LISTS];
+ struct rte_memseg_list *other_msls[RTE_MAX_MEMSEG_LISTS];
+ int i, n_other_msls = 0, n_requested_msls = 0;
+ bool size_hint = (flags & RTE_MEMZONE_SIZE_HINT_ONLY) > 0;
+ unsigned size_flags = flags & ~RTE_MEMZONE_SIZE_HINT_ONLY;
+ void *ret;
+
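+ /*
+ * strategy: first try the heap as is; failing that, expand it with
+ * the page sizes the user asked for, and only then fall back to
+ * whatever other page sizes this socket has available
+ */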
+ rte_spinlock_lock(&(heap->lock));
+
+ /* for legacy mode, try once and with all flags */
+ if (internal_config.legacy_mem) {
+ ret = heap_alloc(heap, type, size, flags,
+ align == 0 ? 1 : align, bound);
+ goto alloc_unlock;
+ }
+
+ /*
+ * we do not pass the size hint here, because even if allocation fails,
+ * we may still be able to allocate memory from appropriate page sizes,
+ * we just need to request more memory first.
+ */
+ ret = heap_alloc(heap, type, size, size_flags, align == 0 ? 1 : align,
+ bound);
+ if (ret != NULL)
+ goto alloc_unlock;
+
+ memset(requested_msls, 0, sizeof(requested_msls));
+ memset(other_msls, 0, sizeof(other_msls));
+
+ /*
+ * go through the memseg lists and take note of all the page sizes
+ * available, and whether any of them were specifically requested by
+ * the user.
+ */
+ for (i = 0; i < RTE_MAX_MEMSEG_LISTS; i++) {
+ struct rte_memseg_list *msl = &mcfg->memsegs[i];
+
+ if (msl->socket_id != socket)
+ continue;
+
+ if (msl->base_va == NULL)
+ continue;
+
+ /* if pages of specific size were requested */
+ if (size_flags != 0 && check_hugepage_sz(size_flags,
+ msl->hugepage_sz)) {
+ requested_msls[n_requested_msls++] = msl;
+ } else if (size_flags == 0 || size_hint) {
+ other_msls[n_other_msls++] = msl;
+ }
+ }
+
+ /* sort the lists, smallest first */
+ qsort(requested_msls, n_requested_msls, sizeof(requested_msls[0]),
+ compare_pagesz);
+ qsort(other_msls, n_other_msls, sizeof(other_msls[0]),
+ compare_pagesz);
+
+ for (i = 0; i < n_requested_msls; i++) {
+ struct rte_memseg_list *msl = requested_msls[i];
+
+ /*
+ * do not pass the size hint here, as user expects other page
+ * sizes first, before resorting to best effort allocation.
+ */
+ ret = try_expand_heap(heap, msl, type, size, socket, size_flags,
+ align, bound);
+ if (ret != NULL)
+ goto alloc_unlock;
+ }
+ if (n_other_msls == 0)
+ goto alloc_unlock;
+
+ /* now, try reserving with size hint */
+ ret = heap_alloc(heap, type, size, flags, align == 0 ? 1 : align,
+ bound);
+ if (ret != NULL)
+ goto alloc_unlock;
+
+ /*
+ * we still couldn't reserve memory, so try expanding heap with other
+ * page sizes, if there are any
+ */
+ for (i = 0; i < n_other_msls; i++) {
+ struct rte_memseg_list *msl = other_msls[i];
+
+ ret = try_expand_heap(heap, msl, type, size, socket, flags,
+ align, bound);
+ if (ret != NULL)
+ goto alloc_unlock;
+ }
+alloc_unlock:
+ rte_spinlock_unlock(&(heap->lock));
+ return ret;
+}
+
+void *
+malloc_heap_alloc(const char *type, size_t size, int socket_arg, unsigned flags,
+ size_t align, size_t bound)
+{
+ int socket, i;
+ void *ret;
+
+ /* return NULL if size is 0 or alignment is not power-of-2 */
+ if (size == 0 || (align && !rte_is_power_of_2(align)))
+ return NULL;
+
+ if (!rte_eal_has_hugepages())
+ socket_arg = SOCKET_ID_ANY;
+
+ if (socket_arg == SOCKET_ID_ANY)
+ socket = malloc_get_numa_socket();
+ else
+ socket = socket_arg;
+
+ /* Check socket parameter */
+ if (socket >= RTE_MAX_NUMA_NODES)
+ return NULL;
+
+ /* TODO: add warning for alignments bigger than page size if not VFIO */
+
+ ret = heap_alloc_on_socket(type, size, socket, flags, align, bound);
+ if (ret != NULL || socket_arg != SOCKET_ID_ANY)
+ return ret;
+
+ /* try other heaps */
+ for (i = 0; i < (int) rte_num_sockets(); i++) {
+ if (i == socket)
+ continue;
+ ret = heap_alloc_on_socket(type, size, i, flags,
+ align, bound);
+ if (ret != NULL)
+ return ret;
+ }
+ return NULL;
+}
+
int
malloc_heap_free(struct malloc_elem *elem) {
struct malloc_heap *heap;
- int ret;
+ void *start, *aligned_start, *end, *aligned_end;
+ size_t len, aligned_len;
+ const struct rte_memseg_list *msl;
+ int n_pages, page_idx, max_page_idx, ret;
if (!malloc_elem_cookies_ok(elem) || elem->state != ELEM_BUSY)
return -1;
/* elem may be merged with previous element, so keep heap address */
heap = elem->heap;
+ msl = elem->msl;
rte_spinlock_lock(&(heap->lock));
- ret = malloc_elem_free(elem);
+ elem = malloc_elem_free(elem);
- rte_spinlock_unlock(&(heap->lock));
+ /* anything after this is a bonus */
+ ret = 0;
+
+ /* ...of which we can't avail ourselves if we are in legacy mode */
+ if (internal_config.legacy_mem)
+ goto free_unlock;
+
+ /* check if we can free any memory back to the system */
+ if (elem->size < msl->hugepage_sz)
+ goto free_unlock;
+
+ /* we probably can, but let's make sure, as we may not be using up a full page */
+ start = elem;
+ len = elem->size;
+ aligned_start = RTE_PTR_ALIGN_CEIL(start, msl->hugepage_sz);
+ end = RTE_PTR_ADD(elem, len);
+ aligned_end = RTE_PTR_ALIGN_FLOOR(end, msl->hugepage_sz);
+ aligned_len = RTE_PTR_DIFF(aligned_end, aligned_start);
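+ /* aligned_start/aligned_end delimit the whole pages lying entirely
+ * within the freed element; anything outside them stays in the heap */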
+
+ /* can't free anything */
+ if (aligned_len < msl->hugepage_sz)
+ goto free_unlock;
+
+ malloc_elem_free_list_remove(elem);
+
+ malloc_elem_hide_region(elem, aligned_start, aligned_len);
+
+ /* we don't really care if we fail to deallocate memory */
+ n_pages = aligned_len / msl->hugepage_sz;
+ page_idx = RTE_PTR_DIFF(aligned_start, msl->base_va) / msl->hugepage_sz;
+ max_page_idx = page_idx + n_pages;
+
+ for (; page_idx < max_page_idx; page_idx++) {
+ struct rte_memseg *ms;
+
+ ms = rte_fbarray_get(&msl->memseg_arr, page_idx);
+ eal_memalloc_free_page(ms);
+ heap->total_size -= msl->hugepage_sz;
+ }
+
+ RTE_LOG(DEBUG, EAL, "Heap on socket %d was shrunk by %zdMB\n",
+ msl->socket_id, aligned_len >> 20ULL);
+free_unlock:
+ rte_spinlock_unlock(&(heap->lock));
return ret;
}
@@ -53,8 +53,8 @@ malloc_get_numa_socket(void)
}
void *
-malloc_heap_alloc(struct malloc_heap *heap, const char *type, size_t size,
- unsigned flags, size_t align, size_t bound);
+malloc_heap_alloc(const char *type, size_t size, int socket, unsigned flags,
+ size_t align, size_t bound);
int
malloc_heap_free(struct malloc_elem *elem);
@@ -68,9 +68,7 @@ void rte_free(void *addr)
void *
rte_malloc_socket(const char *type, size_t size, unsigned align, int socket_arg)
{
- struct rte_mem_config *mcfg = rte_eal_get_configuration()->mem_config;
- int socket, i;
- void *ret;
+ int socket;
/* return NULL if size is 0 or alignment is not power-of-2 */
if (size == 0 || (align && !rte_is_power_of_2(align)))
@@ -88,24 +86,8 @@ rte_malloc_socket(const char *type, size_t size, unsigned align, int socket_arg)
if (socket >= RTE_MAX_NUMA_NODES)
return NULL;
- ret = malloc_heap_alloc(&mcfg->malloc_heaps[socket], type,
- size, 0, align == 0 ? 1 : align, 0);
- if (ret != NULL || socket_arg != SOCKET_ID_ANY)
- return ret;
-
- /* try other heaps */
- for (i = 0; i < RTE_MAX_NUMA_NODES; i++) {
- /* we already tried this one */
- if (i == socket)
- continue;
-
- ret = malloc_heap_alloc(&mcfg->malloc_heaps[i], type,
- size, 0, align == 0 ? 1 : align, 0);
- if (ret != NULL)
- return ret;
- }
-
- return NULL;
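+ /* SOCKET_ID_ANY fallback is handled inside malloc_heap_alloc() */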
+ return malloc_heap_alloc(type, size, socket_arg, 0,
+ align == 0 ? 1 : align, 0);
}
/*