Lock-Free Memory Pool¶
Overview¶
Deterministic O(1) allocator for fixed-size objects (slabs). All memory is pre-allocated at initialization; alloc/free operations manipulate a lock-free LIFO freelist using compare-and-swap (CAS). Supports multi-producer multi-consumer (MPMC) multiple threads can allocate and free concurrently without locks.
Status: Implemented
Full implementation with CAS-based freelist, ABA prevention via generation counter, cache-line-aligned slabs, and allocation statistics.
Memory Layout¶
┌───────────────────────────────────────────────────────────────────┐
│ Pre-allocated contiguous buffer (hugepage-friendly): │
│ │
│ ┌────────────┬────────────┬────────────┬──── ──── ┬────────────┐ │
│ │ Slab 0 │ Slab 1 │ Slab 2 │ ... │ Slab N │ │
│ │ (64B+) │ (64B+) │ (64B+) │ │ (64B+) │ │
│ └─────┬──────┴─────┬──────┴─────┬──────┴──── ────┴────────────┘ │
│ │ │ │ │
│ ▼ ▼ ▼ │
│ Freelist (LIFO): │
│ head → [gen:17|idx:0] → [idx:1] → [idx:2] → ... → [NULL] │
│ │
│ Each free slab embeds a next-pointer (index) in its first bytes. │
│ When allocated, the slab content overwrites the next-pointer. │
└───────────────────────────────────────────────────────────────────┘
ABA Prevention¶
┌───────────────────────────────────────────────────────────────────┐
│ Tagged Pointer (64-bit atomic): │
│ │
│ ┌──────────────────────┬──────────────────────┐ │
│ │ Generation (32 bit) │ Slab Index (32 bit) │ │
│ └──────────────────────┴──────────────────────┘ │
│ Bits 63..32 Bits 31..0 │
│ │
│ ABA Problem: │
│ 1. Thread A reads head = [gen:5, idx:3] │
│ 2. Thread B pops idx:3, pushes idx:3 back (different content) │
│ 3. Thread A's CAS succeeds on stale data! │
│ │
│ Solution: increment generation on every CAS │
│ → Thread A sees [gen:7, idx:3] ≠ [gen:5, idx:3] → CAS fails │
└───────────────────────────────────────────────────────────────────┘
API Reference¶
Data Structures¶
typedef _Atomic uint64_t mpool_tagged_ptr_t;
#define MPOOL_NULL_INDEX UINT32_MAX
#define MPOOL_MAKE_TAG(gen, idx) (((uint64_t)(gen) << 32) | (uint64_t)(idx))
#define MPOOL_GET_GEN(tag) ((uint32_t)((tag) >> 32))
#define MPOOL_GET_IDX(tag) ((uint32_t)((tag) & 0xFFFFFFFF))
typedef struct {
uint32_t next_index; /* Index of next free slab */
} mpool_node_t;
typedef struct {
/* Freelist head (tagged pointer) */
alignas(MPOOL_CACHE_LINE) mpool_tagged_ptr_t freelist_head;
uint8_t _pad[MPOOL_CACHE_LINE - sizeof(mpool_tagged_ptr_t)];
/* Pool metadata (immutable after init) */
uint8_t *buffer; /* Base address of slab array */
size_t slab_size; /* Usable size per slab */
size_t slab_stride; /* Actual stride (cache-line aligned) */
uint32_t capacity; /* Total number of slabs */
_Atomic uint32_t alloc_count; /* Currently allocated slabs */
} memory_pool_t;
Constants¶
Lifecycle¶
memory_pool_init¶
Initialize the pool and build the freelist.
| Parameter | Type | Description |
|---|---|---|
buffer | void * | Pre-allocated memory (capacity × slab_stride bytes) |
capacity | uint32_t | Number of slabs |
slab_size | size_t | Minimum usable bytes per slab (≥ 4 bytes) |
Slab stride is automatically aligned up to the nearest cache line boundary.
Returns: 0 on success, -1 on invalid parameters.
Allocation¶
memory_pool_alloc¶
Allocate one slab from the pool. Lock-free, thread-safe (MPMC).
Algorithm: 1. Load freelist_head (acquire) 2. Read next_index from the slab at the head 3. CAS: swap head to [gen+1, next_index] 4. On failure, retry (another thread beat us)
Returns: Pointer to slab, or NULL if pool is exhausted.
memory_pool_free¶
Return a slab to the pool. Lock-free, thread-safe.
Algorithm: 1. Calculate slab index from pointer 2. Load freelist_head 3. Set slab's next_index = current head index 4. CAS: swap head to [gen+1, this slab's index] 5. On failure, retry
Double-Free Detection
The pool does not detect double-free at runtime (for performance). Use AddressSanitizer (-fsanitize=address) during development.
Query¶
uint32_t memory_pool_used(const memory_pool_t *pool); /* Allocated count */
uint32_t memory_pool_available(const memory_pool_t *pool); /* Free count */
Usage Examples¶
Basic Allocate/Free¶
#include "memory_pool.h"
#include <stdio.h>
#define POOL_SIZE 256
#define SLAB_SIZE 128 /* 128 usable bytes per allocation */
/* Aligned buffer (stride will be rounded to 128 bytes = 2 cache lines) */
static uint8_t pool_memory[POOL_SIZE * 128] __attribute__((aligned(64)));
static memory_pool_t pool;
int main(void)
{
memory_pool_init(&pool, pool_memory, POOL_SIZE, SLAB_SIZE);
printf("Pool: %u slabs, %u available\n", pool.capacity, memory_pool_available(&pool));
/* Allocate */
void *buf1 = memory_pool_alloc(&pool);
void *buf2 = memory_pool_alloc(&pool);
printf("Allocated: %p, %p (%u used)\n", buf1, buf2, memory_pool_used(&pool));
/* Use the memory */
memset(buf1, 0xAA, SLAB_SIZE);
memset(buf2, 0xBB, SLAB_SIZE);
/* Free */
memory_pool_free(&pool, buf1);
memory_pool_free(&pool, buf2);
printf("After free: %u available\n", memory_pool_available(&pool));
return 0;
}
Multi-Threaded Packet Buffer Pool¶
#include "memory_pool.h"
#include <pthread.h>
#include <stdio.h>
#define PKT_SIZE 2048
#define PKT_COUNT 1024
static uint8_t pkt_memory[PKT_COUNT * 2048] __attribute__((aligned(64)));
static memory_pool_t pkt_pool;
/* Producer: allocates buffers, fills with data */
void *rx_thread(void *arg)
{
(void)arg;
for (int i = 0; i < 100000; i++) {
uint8_t *pkt = (uint8_t *)memory_pool_alloc(&pkt_pool);
if (!pkt) {
__builtin_ia32_pause();
i--;
continue;
}
/* Simulate packet receive into pre-allocated buffer */
pkt[0] = (uint8_t)(i & 0xFF);
/* ... pass pkt pointer to processing thread via SPSC queue ... */
/* For this example, just free it back */
memory_pool_free(&pkt_pool, pkt);
}
return NULL;
}
int main(void)
{
memory_pool_init(&pkt_pool, pkt_memory, PKT_COUNT, PKT_SIZE);
pthread_t threads[4];
for (int i = 0; i < 4; i++) {
pthread_create(&threads[i], NULL, rx_thread, NULL);
}
for (int i = 0; i < 4; i++) {
pthread_join(threads[i], NULL);
}
printf("Final: %u/%u available (should be %u)\n",
memory_pool_available(&pkt_pool), PKT_COUNT, PKT_COUNT);
return 0;
}
Build & Run¶
# Compile (native march for best CAS codegen)
gcc -Wall -Wextra -Werror -O3 -std=c11 -march=native \
-o memory_pool_demo memory_pool.c -lpthread
# Run benchmark
taskset -c 0-3 ./memory_pool_demo
Performance¶
| Threads | Operation | Latency (avg) | Throughput |
|---|---|---|---|
| 1 (no contention) | alloc + free | ~8 ns | 125 Mops/s |
| 2 (SPSC pattern) | alloc + free | ~15 ns | 66 Mops/s |
| 4 (high contention) | alloc + free | ~45 ns | 22 Mops/s |
Contention Impact
With high contention (many threads competing for the same freelist head), CAS retries increase. For best performance, pair with per-thread caching or partition the pool across threads.
CAS Operation (x86-64 Assembly)¶
; memory_pool_alloc hot loop (generated by GCC -O3):
.L_retry:
mov rax, [rdi] ; load freelist_head (acquire)
mov ecx, eax ; extract index (low 32 bits)
cmp ecx, 0xFFFFFFFF ; NULL check
je .L_empty
mov edx, [rbx + rcx*8] ; read next_index from slab
shr rax, 32 ; extract generation
inc eax ; gen + 1
shl rax, 32
or rax, rdx ; new_head = [gen+1 | next_idx]
lock cmpxchg [rdi], rax ; CAS (acquire-release)
jne .L_retry ; retry on failure
Test Output¶
$ taskset -c 0-3 ./build/memory_pool_bench
[memory_pool] Init: 1024 slabs × 128B stride (cache-line aligned)
[memory_pool] Alloc/free correctness: 100000 cycles, 0 leaks ✓
[memory_pool] ABA test: 1000000 concurrent ops, no corruption ✓
[memory_pool] Benchmark (4 threads, 1M alloc+free each):
Average: 42.3 ns/op (CAS retries: 12.4%)
Peak: 15.1 ns/op (no contention)
[memory_pool] Final state: 1024/1024 available ✓
[PASS] All memory_pool tests passed