Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 3 additions & 2 deletions CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -568,10 +568,11 @@ if(NOT SNMALLOC_HEADER_ONLY_LIBRARY)

if (${TEST} MATCHES "release-.*")
message(VERBOSE "Adding test: ${TESTNAME} only for release configs")
add_test(NAME ${TESTNAME} COMMAND ${TESTNAME} CONFIGURATIONS "Release")
add_test(NAME ${TESTNAME} COMMAND ${TESTNAME} --smoke
CONFIGURATIONS "Release")
else()
message(VERBOSE "Adding test: ${TESTNAME}")
add_test(${TESTNAME} ${TESTNAME})
add_test(NAME ${TESTNAME} COMMAND ${TESTNAME} --smoke)
endif()
if (${TEST_CATEGORY} MATCHES "perf")
message(VERBOSE "Single threaded test: ${TESTNAME}")
Expand Down
17 changes: 14 additions & 3 deletions src/test/func/memory/memory.cc
Original file line number Diff line number Diff line change
Expand Up @@ -462,7 +462,7 @@ void test_static_sized_alloc()
test_static_sized_alloc<asz, dealloc - 1>();
}

template<size_t max_size = bits::one_at_bit(23)>
template<size_t max_size = bits::one_at_bit(20)>
void test_static_sized_allocs()
{
if (max_size < 16)
Expand Down Expand Up @@ -554,14 +554,19 @@ int main(int, char**)
}
#endif
auto start = std::chrono::steady_clock::now();
// Most tests below have substantial internal iteration (size-class
// sweeps, per-offset loops, batch alloc/dealloc), so a large outer
// repetition is redundant for coverage. A small outer count still
// catches consolidation/leak issues that only manifest across
// repeated entry to a test.
#define TEST(testname) \
do \
{ \
auto end = std::chrono::steady_clock::now(); \
auto diff_seconds = \
std::chrono::duration_cast<std::chrono::seconds>(end - start).count(); \
std::cout << "Running " #testname << " @ " << diff_seconds << std::endl; \
for (size_t i = 0; i < 50; i++) \
for (size_t i = 0; i < 3; i++) \
testname(); \
} while (0);

Expand All @@ -574,7 +579,13 @@ int main(int, char**)
TEST(test_calloc_large_bug);
TEST(test_external_pointer_stack);
TEST(test_external_pointer_dealloc_bug);
TEST(test_external_pointer_large);
// test_external_pointer_large allocates ~16MB per object across 32
// objects (~512MB total) and walks every 16MB-aligned interior
// pointer. It is already a stress test in its own right, so running
// it once is enough; it is therefore invoked outside the TEST(...)
// outer-repeat macro.
std::cout << "Running test_external_pointer_large (single pass)" << std::endl;
test_external_pointer_large();
TEST(test_external_pointer);
TEST(test_alloc_16M);
TEST(test_calloc_16M);
Expand Down
11 changes: 9 additions & 2 deletions src/test/perf/contention/contention.cc
Original file line number Diff line number Diff line change
Expand Up @@ -162,8 +162,15 @@ int main(int argc, char** argv)
opt::Opt opt(argc, argv);
size_t cores = opt.is<size_t>("--cores", 8);

size_t count = opt.is<size_t>("--swapcount", 1 << 20);
size_t size = opt.is<size_t>("--swapsize", 1 << 18);
// `--smoke` lowers the *defaults* for the iteration knobs so ctest
// runs at modest cost. Explicit `--swapcount` / `--swapsize` on the
// command line still win. The smoke values must remain large
// enough to cross the remote-deallocation cache thresholds
// (otherwise `mem/remotecache.h` and `mem/remoteallocator.h`
// coverage drops sharply).
bool smoke = opt.has("--smoke");
size_t count = opt.is<size_t>("--swapcount", smoke ? 1u << 18 : 1u << 20);
size_t size = opt.is<size_t>("--swapsize", smoke ? 1u << 16 : 1u << 18);
use_malloc = opt.has("--use_malloc");

std::cout << "Allocator is " << (use_malloc ? "System" : "snmalloc")
Expand Down
47 changes: 29 additions & 18 deletions src/test/perf/external_pointer/externalpointer.cc
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
#include <test/measuretime.h>
#include <test/opt.h>
#include <test/setup.h>
#include <test/snmalloc_testlib.h>
#include <test/xoroshiro.h>
Expand Down Expand Up @@ -50,21 +51,8 @@ namespace test
snmalloc::debug_check_empty();
}

void test_external_pointer(xoroshiro::p128r64& r)
void test_external_pointer(xoroshiro::p128r64& r, size_t iterations)
{
// This is very slow on Windows at the moment. Until this is fixed, help
// CI terminate.
#if defined(NDEBUG) && !defined(_MSC_VER)
static constexpr size_t iterations = 10000000;
#else
# ifdef _MSC_VER
// Windows Debug build is very slow on this test.
// Reduce complexity to balance CI times.
static constexpr size_t iterations = 50000;
# else
static constexpr size_t iterations = 100000;
# endif
#endif
setup(r);

{
Expand Down Expand Up @@ -93,15 +81,38 @@ namespace test
}
}

int main(int, char**)
int main(int argc, char** argv)
{
setup();

xoroshiro::p128r64 r;
opt::Opt opt(argc, argv);

// Default iteration count varies by build: non-MSVC Release builds run
// far more iterations than MSVC or Debug builds. Under `--smoke` every
// build uses the same small fixed count instead, intended to still
// exercise every interior-pointer dispatch path.
size_t cli_default;
// This is very slow on Windows at the moment. Until this is fixed, help
// CI terminate.
#if defined(NDEBUG) && !defined(_MSC_VER)
cli_default = 10000000;
#elif defined(_MSC_VER)
// All MSVC builds (Debug and Release) take this branch, because the
// Release case above excludes MSVC. Windows is very slow on this test,
// so reduce complexity to balance CI times.
cli_default = 50000;
#else
cli_default = 100000;
#endif
size_t iterations = opt.has("--smoke") ? 10000 : cli_default;

size_t nn = snmalloc::Debug ? 30 : 3;
// Outer-repeat count: Debug repeats 30x to amortise setup, Release 3x.
// `--smoke` reduces both to a single repeat; one repeat is enough to
// hit every path since `setup()` re-randomises the object table each call.
size_t nn_default = snmalloc::Debug ? 30 : 3;
size_t nn = opt.has("--smoke") ? 1 : nn_default;

xoroshiro::p128r64 r;

for (size_t n = 0; n < nn; n++)
test::test_external_pointer(r);
test::test_external_pointer(r, iterations);
return 0;
}
33 changes: 20 additions & 13 deletions src/test/perf/large_alloc/large_alloc.cc
Original file line number Diff line number Diff line change
@@ -1,19 +1,19 @@
#include <test/measuretime.h>
#include <test/opt.h>
#include <test/setup.h>
#include <test/snmalloc_testlib.h>

using namespace snmalloc;

static constexpr size_t ALLOC_SIZE = 800 * 1024; // 800 KB
static constexpr size_t ITERATIONS = 100000;

void test_alloc_dealloc_cycle()
void test_alloc_dealloc_cycle(size_t iterations)
{
{
MeasureTime m;
m << "Alloc/dealloc 800KB x " << ITERATIONS;
m << "Alloc/dealloc 800KB x " << iterations;

for (size_t i = 0; i < ITERATIONS; i++)
for (size_t i = 0; i < iterations; i++)
{
void* p = snmalloc::alloc(ALLOC_SIZE);
SNMALLOC_CHECK(p != nullptr);
Expand All @@ -24,15 +24,15 @@ void test_alloc_dealloc_cycle()
snmalloc::debug_check_empty();
}

void test_batch_alloc_then_dealloc()
void test_batch_alloc_then_dealloc(size_t iterations)
{
static constexpr size_t BATCH = 128;

void* ptrs[BATCH];

MeasureTime m;
m << "Batch alloc then dealloc 800KB x " << BATCH;
for (size_t j = 0; j < ITERATIONS / BATCH; j++)
for (size_t j = 0; j < iterations / BATCH; j++)
{
for (size_t i = 0; i < BATCH; i++)
{
Expand All @@ -49,13 +49,13 @@ void test_batch_alloc_then_dealloc()
snmalloc::debug_check_empty();
}

void test_alloc_dealloc_with_touch()
void test_alloc_dealloc_with_touch(size_t iterations)
{
{
MeasureTime m;
m << "Alloc/touch/dealloc 800KB x " << ITERATIONS;
m << "Alloc/touch/dealloc 800KB x " << iterations;

for (size_t i = 0; i < ITERATIONS; i++)
for (size_t i = 0; i < iterations; i++)
{
char* p = static_cast<char*>(snmalloc::alloc(ALLOC_SIZE));
SNMALLOC_CHECK(p != nullptr);
Expand All @@ -71,13 +71,20 @@ void test_alloc_dealloc_with_touch()
snmalloc::debug_check_empty();
}

int main(int, char**)
int main(int argc, char** argv)
{
setup();

test_alloc_dealloc_cycle();
test_batch_alloc_then_dealloc();
test_alloc_dealloc_with_touch();
opt::Opt opt(argc, argv);
// Each test does alloc/dealloc cycles driven by `iterations`. The
// batch test runs `iterations / BATCH` rounds with BATCH=128, so the
// smoke value must be at least 128 for it to run a full batch round;
// 8192 gives 64 rounds.
size_t iterations = opt.has("--smoke") ? 8192 : 100000;

test_alloc_dealloc_cycle(iterations);
test_batch_alloc_then_dealloc(iterations);
test_alloc_dealloc_with_touch(iterations);

return 0;
}
8 changes: 8 additions & 0 deletions src/test/perf/lotsofthreads/lotsofthread.cc
Original file line number Diff line number Diff line change
Expand Up @@ -101,6 +101,14 @@ int main()
#else
size_t iterations = 200000;
#endif
#ifndef NDEBUG
// Debug builds run with full instrumentation enabled and are
// ~10x slower per iteration. The cross-thread batch behaviour
// this benchmark stresses is observable at much lower counts;
// reduce iterations so this test does not dominate Debug ctest
// wall-time. Release builds are unaffected.
iterations /= 10;
#endif

int threadcount = 8;
vector<thread> threads;
Expand Down
20 changes: 19 additions & 1 deletion src/test/perf/memcpy/memcpy.cc
Original file line number Diff line number Diff line change
Expand Up @@ -25,9 +25,15 @@ size_t my_random()

std::vector<Shape> allocs;

// Number of distinct destination buffers per size class. Each `test()`
// call iterates over every entry in `allocs` and runs the memcpy
// implementation under measurement, so this is the per-size repeat
// count. Set by `main()` from `--smoke`.
size_t allocs_per_size = 1000;

void shape(size_t size)
{
for (size_t i = 0; i < 1000; i++)
for (size_t i = 0; i < allocs_per_size; i++)
{
auto rsize = size * 2;
auto offset = 0;
Expand Down Expand Up @@ -70,6 +76,12 @@ void test(
{
auto src = snmalloc::alloc(size);
shape(size);
// The outer loop is a measurement-variance loop, not a coverage knob:
// it gathers ten timing samples per size for the perf statistics.
// Under `--smoke` it still runs ten times, but each `test_memcpy`
// call exercises only `allocs_per_size` (smoke value) memcpys, so the
// total work is small. Coverage is unaffected because every code path
// is hit on the first pass.
for (size_t i = 0; i < 10; i++)
{
MeasureTime m(true);
Expand Down Expand Up @@ -108,6 +120,12 @@ int main(int argc, char** argv)
opt::Opt opt(argc, argv);
bool full_test = opt.has("--full_test");

// Number of destination buffers per size class. Smoke mode shrinks
// it tenfold (1000 -> 100) because each `test()` call already runs ten
// measurement passes per size, which is more than enough to exercise
// every memcpy code path.
allocs_per_size = opt.has("--smoke") ? 100 : 1000;

// size_t size = 0;
auto mc_platform_checked = [](void* dst, const void* src, size_t len) {
memcpy_platform_checked(dst, src, len);
Expand Down
8 changes: 7 additions & 1 deletion src/test/perf/msgpass/msgpass.cc
Original file line number Diff line number Diff line change
Expand Up @@ -191,10 +191,16 @@ int main(int argc, char** argv)
struct params param;

opt::Opt opt(argc, argv);
// `--smoke` lowers the *default* per-producer batch count so ctest
// runs at modest cost. Explicit `--batches` on the command line
// still wins. The smoke value must remain large enough for the
// cross-thread remote-deallocation cache thresholds in
// `mem/remotecache.h` / `mem/remoteallocator.h` to fire.
size_t batches_default = opt.has("--smoke") ? 1u << 18 : 1024 * 1024;
param.N_PRODUCER = opt.is<size_t>("--producers", 3);
param.N_CONSUMER = opt.is<size_t>("--consumers", 3);
param.N_PROXY = opt.is<size_t>("--proxies", 2);
param.N_PRODUCER_BATCH = opt.is<size_t>("--batches", 1024 * 1024);
param.N_PRODUCER_BATCH = opt.is<size_t>("--batches", batches_default);
param.N_MAX_OUTSTANDING = opt.is<size_t>("--max-out", 4 * 1024);
param.N_MAX_BATCH_SIZE = opt.is<size_t>("--max-batch", 16);

Expand Down
26 changes: 17 additions & 9 deletions src/test/perf/singlethread/singlethread.cc
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
#include <test/measuretime.h>
#include <test/opt.h>
#include <test/setup.h>
#include <test/snmalloc_testlib.h>
#include <unordered_set>
Expand Down Expand Up @@ -62,24 +63,31 @@ void test_alloc_dealloc(size_t count, size_t size, bool write)
snmalloc::debug_check_empty();
}

int main(int, char**)
int main(int argc, char** argv)
{
setup();

opt::Opt opt(argc, argv);
// Default `count` exercises sizeclass dispatch many times; under
// `--smoke` every size and zeroing mode is still run, but the
// per-size count is cut (1 << 15 -> 1 << 12 for small sizes,
// 1 << 10 -> 1 << 8 for large sizes).
size_t count_small = opt.has("--smoke") ? 1u << 12 : 1u << 15;
size_t count_large = opt.has("--smoke") ? 1u << 8 : 1u << 10;

for (size_t size = 16; size <= 128; size <<= 1)
{
test_alloc_dealloc<ZeroMem::NoZero>(1 << 15, size, false);
test_alloc_dealloc<ZeroMem::NoZero>(1 << 15, size, true);
test_alloc_dealloc<ZeroMem::YesZero>(1 << 15, size, false);
test_alloc_dealloc<ZeroMem::YesZero>(1 << 15, size, true);
test_alloc_dealloc<ZeroMem::NoZero>(count_small, size, false);
test_alloc_dealloc<ZeroMem::NoZero>(count_small, size, true);
test_alloc_dealloc<ZeroMem::YesZero>(count_small, size, false);
test_alloc_dealloc<ZeroMem::YesZero>(count_small, size, true);
}

for (size_t size = 1 << 12; size <= 1 << 17; size <<= 1)
{
test_alloc_dealloc<ZeroMem::NoZero>(1 << 10, size, false);
test_alloc_dealloc<ZeroMem::NoZero>(1 << 10, size, true);
test_alloc_dealloc<ZeroMem::YesZero>(1 << 10, size, false);
test_alloc_dealloc<ZeroMem::YesZero>(1 << 10, size, true);
test_alloc_dealloc<ZeroMem::NoZero>(count_large, size, false);
test_alloc_dealloc<ZeroMem::NoZero>(count_large, size, true);
test_alloc_dealloc<ZeroMem::YesZero>(count_large, size, false);
test_alloc_dealloc<ZeroMem::YesZero>(count_large, size, true);
}

return 0;
Expand Down
Loading