Baillehache Pascal's personal website

Optimisation using memory pool

This article presents the results of a quick experiment to check how a memory pool can be used to optimise performance.

Implementing an algorithm with dynamically allocated memory can be slower than implementing it with statically allocated memory, for at least two reasons:

Dynamic allocation needs to find contiguous space in the heap and to maintain information about the allocated memory. Statically allocated variables are either in the data segment, set up at compile time and ready to use from the beginning of the execution, or on the stack, where memory management is straightforward.
Dynamically allocated memory needs to be freed, a time-consuming process that is not needed (or reduces to stack unwinding) for statically allocated memory.

Hence, when speed matters, statically allocated memory should be preferred to dynamically allocated memory. If the amount of memory needed is known, the implementation is straightforward. If not, one could statically allocate for each variable an amount of memory larger than what will ever be used. However, this is only possible if an upper bound can be determined, it risks crashes or bugs if that upper bound turns out to be too low, and it generally consumes more memory than necessary.

Another solution is to pre-allocate memory and redefine the allocation/free operations to use and reuse that memory. It can be made more efficient than conventional dynamic memory allocation by constraining the size of the allocated memory. Depending on the details of the implementation, it is known as memory pools, object pools, or slab allocation. The more often a block of memory is allocated/freed, the more advantageous this solution becomes.

To try it, I've implemented a memory pool as follows. The pool is a doubly linked list of a given data structure (let's call it T). Upon creation the pool is an empty list. When the user requests the allocation of an instance of T, if there is a free one in the list it is returned; otherwise a new one is dynamically allocated, added to the list, and returned. When the user requests the freeing of an instance of T, it is not actually freed but instead marked as free for reuse. The list is kept ordered as follows: free instances are at the beginning of the list, instances currently in use are at the end of the list. The memory pool implementation keeps a pointer to the list's head and a pointer to its tail. Allocating/freeing memory is then as simple as checking the head and tail of the list, and moving one element from one end of the list to the other, or adding a new element when the list contains no free element. When the memory pool itself is freed (flushed), the remaining elements are actually freed. The two macros below make it possible to create such a memory pool for a given data structure.

/*
 * MemoryPool(T): generate a memory pool for the structure T.
 *
 * T must contain a member named 'mempool' with the fields 'prev', 'next'
 * (both T*) and 'isFree' (bool), as declared by MemoryPoolFields(T).
 * The expansion needs malloc/free, assert, NULL and true/false in scope.
 *
 * The pool is a doubly linked list of T. Elements marked as free are kept
 * on the head side of the list and elements in use on the tail side.
 *
 * Generated type and functions (for T = Foo):
 *   FooMemoryPool       the pool, holding the 'head' and 'tail' of the list
 *   FooMemoryPoolInit   make the pool empty
 *   FooMemoryPoolFlush  free() every element of the list and make the pool
 *                       empty again
 *   FooMemoryPoolAlloc  return the head if it is marked free (after moving
 *                       it to the tail), else a newly malloc'ed element
 *                       appended at the tail. A new element has every
 *                       member other than the list links zero-initialised.
 *                       If malloc fails, the assert fires; when assertions
 *                       are disabled (NDEBUG) NULL is returned instead.
 *   FooMemoryPoolFree   mark 'elem' as free and move it to the head of the
 *                       list; the memory is not released. 'elem' must
 *                       belong to the pool. A NULL 'elem' is ignored.
 */
#define MemoryPool(T)                                               \
  typedef struct T ## MemoryPool {                                  \
    T* head;                                                        \
    T* tail;                                                        \
  } T ## MemoryPool;                                                \
  void T ## MemoryPool ## Init(T ## MemoryPool* that) {             \
    that->head = that->tail = NULL;                                 \
  }                                                                 \
  void T ## MemoryPool ## Flush(T ## MemoryPool* that) {            \
    T* ptr = that->head;                                            \
    while(ptr) {                                                    \
      /* Read the link before the element is released. */           \
      T* next = ptr->mempool.next;                                  \
      free(ptr);                                                    \
      ptr = next;                                                   \
    }                                                               \
    that->head = that->tail = NULL;                                 \
  }                                                                 \
  T* T ## MemoryPool ## Alloc(T ## MemoryPool* that) {              \
    T* ptr = that->head;                                            \
    if(ptr && ptr->mempool.isFree) {                                \
      /* Reuse the head. If it is not the only element, detach */   \
      /* it and append it at the tail, with the elements in use. */ \
      if(ptr->mempool.next) {                                       \
        that->head = ptr->mempool.next;                             \
        that->head->mempool.prev = NULL;                            \
        ptr->mempool.next = NULL;                                   \
        ptr->mempool.prev = that->tail;                             \
        that->tail->mempool.next = ptr;                             \
        that->tail = ptr;                                           \
      }                                                             \
    } else {                                                        \
      /* No free element available: grow the list at its tail. */   \
      ptr = malloc(sizeof(T));                                      \
      assert(ptr);                                                  \
      /* With NDEBUG the assert vanishes: never write to NULL. */   \
      if(ptr == NULL) return NULL;                                  \
      *ptr = (T){ .mempool.prev=that->tail, .mempool.next=NULL };   \
      if(that->head == NULL) {                                      \
        that->head = that->tail = ptr;                              \
      } else {                                                      \
        that->tail->mempool.next = ptr;                             \
        that->tail = ptr;                                           \
      }                                                             \
    }                                                               \
    ptr->mempool.isFree = false;                                    \
    return ptr;                                                     \
  }                                                                 \
  void T ## MemoryPool ## Free(                                     \
    T ## MemoryPool* that, T* elem) {                               \
    /* Like free(), accept NULL and do nothing. */                  \
    if(elem == NULL) return;                                        \
    elem->mempool.isFree = true;                                    \
    /* An element already at the head only needs the flag. */       \
    if(elem != that->head) {                                        \
      /* Unlink the element from its current position... */         \
      if(elem->mempool.prev) {                                      \
        elem->mempool.prev->mempool.next = elem->mempool.next;      \
      }                                                             \
      if(elem->mempool.next) {                                      \
        elem->mempool.next->mempool.prev = elem->mempool.prev;      \
      }                                                             \
      if(that->tail == elem) that->tail = elem->mempool.prev;       \
      /* ...and insert it in front of the current head. */          \
      elem->mempool.prev = NULL;                                    \
      elem->mempool.next = that->head;                              \
      that->head->mempool.prev = elem;                              \
      that->head = elem;                                            \
    }                                                               \
  }

/*
 * MemoryPoolFields(T): declare, inside the definition of the structure T,
 * the member 'mempool' used by the functions generated by MemoryPool(T):
 *   prev, next  links to the neighbouring elements in the pool's list
 *   isFree      true when the element is available for reuse
 * The expansion does not include the terminating semicolon.
 */
#define MemoryPoolFields(T)  \
struct {                     \
  T* prev;                   \
  T* next;                   \
  bool isFree;               \
} mempool

The macros can be used as follows to create a dummy Data structure and a memory pool for that structure:

// Dummy structure used to try the memory pool.
// The typedef comes first so that MemoryPoolFields(Data) can declare
// Data* links inside the structure definition.
typedef struct Data Data;
struct Data{
  // Payload of the structure
  double x;
  // Links and free flag used by the memory pool
  MemoryPoolFields(Data);
};
// Generate the type DataMemoryPool and the functions DataMemoryPoolInit,
// DataMemoryPoolFlush, DataMemoryPoolAlloc and DataMemoryPoolFree
MemoryPool(Data)

I've tested the performance of that solution as follows. I run through Pascal's triangle and, at each position, allocate and free as many instances of Data as the value at that position in the triangle. Then I compare the execution time of malloc/free with that of the memory pool described above.

#include "LibCapy/capy.h"

// Number of rows of Pascal's triangle processed by each test function
#define NB_LOOP 100
// Number of columns of the triangle kept in the test functions' array
#define N 10

uint64_t TestMalloc(void) {
  uint16_t arr[N];
  loop(i, N) arr[i] = 0;
  arr[0] = 1;
  uint64_t garbage = 0;
  loop(iLoop, NB_LOOP) {
    loop(i, N - 1) arr[N - 1 - i] += arr[N - 2 - i];
    loop(i, N) loop(j, arr[i]) {
      Data* data = malloc(sizeof(Data));
      data->x = i;
      garbage += data->x;
      free(data);
    }
  }
  return garbage;
}

uint64_t TestMemPool(void) {
  uint16_t arr[N];
  loop(i, N) arr[i] = 0;
  arr[0] = 1;
  DataMemoryPool memPool;
  DataMemoryPoolInit(&memPool);
  uint64_t garbage = 0;
  loop(iLoop, NB_LOOP) {
    loop(i, N - 1) arr[N - 1 - i] += arr[N - 2 - i];
    loop(i, N) loop(j, arr[i]) {
      Data* data = DataMemoryPoolAlloc(&memPool);
      data->x = i;
      garbage += data->x;
      DataMemoryPoolFree(&memPool, data);
    }
  }
  DataMemoryPoolFlush(&memPool);
  return garbage;
}

#define NB_TRY 100
#define ALGO 1
int main() {
  uint64_t s = 0.0;
  CapyChrono chrono = CapyChronoCreate();
  double delay[NB_TRY];
  loop(i, NB_TRY) {
    $(&chrono, start)();
#if ALGO==0
    s += TestMalloc();
#else
    s += TestMemPool();
#endif
    $(&chrono, stop)();
    delay[i] = $(&chrono, getElapsedTime)(capyChrono_second);
  }
  double avg = 0.0;
  loop(i, NB_TRY) avg += delay[i] / (double)NB_TRY;
  double sigma = 0.0;
  loop(i, NB_TRY) sigma += pow(delay[i] - avg, 2.0) / (double)NB_TRY;
  println("average time: %lf sigma: %lf", avg, sigma);
  return (s & 1);
}

Results are as follows:

using malloc/free:
average time: 0.244160 sigma: 0.000008
using memory pool:
average time: 0.108530 sigma: 0.000008

The version using the memory pool appears to be around 2.2 times faster than the version using malloc/free.

2022-10-23
in All, C programming,
101 views
A comment, question, correction ? A project we could work together on ? Email me!
Learn more about me in my profile.