From 1b54b42b5d31f1f1d72f2760438f2a323ab44e73 Mon Sep 17 00:00:00 2001
From: Karim Mohamed <karimsayedre@gmail.com>
Date: Wed, 13 May 2026 10:50:22 +0300
Subject: [PATCH 1/4] Refactor GPU benchmarking with unified helper framework

Refactored the GPU benchmarking system to use new helper classes (`IBenchmark`, `GPUBenchmarkHelper`, and related types), replacing manual pipeline
nuffer, and query management with a unified, extensible framework. Benchmarks now use a common `Aggregator` for session orchestration, CLI parsing, baseline comparison, and reporting. Added support for hierarchical variant names, focus/median-of-K runs, robust JSON output, and improved console reporting. Updated main application and all benchmarks to use the new structure, improving maintainability and extensibility.
---
 .../benchmarks/CDiscreteSamplerBenchmark.h    |  398 ++-
 .../benchmarks/CSamplerBenchmark.h            |  273 +--
 37_HLSLSamplingTests/main.cpp                 |  393 ++-
 64_EmulatedFloatTest/main.cpp                 | 2131 ++++++++---------
 .../nbl/examples/Benchmark/BenchmarkCli.h     |  125 +
 .../nbl/examples/Benchmark/BenchmarkConsole.h |  526 ++++
 .../nbl/examples/Benchmark/BenchmarkJson.h    |  285 +++
 .../nbl/examples/Benchmark/BenchmarkTypes.h   |  211 ++
 .../examples/Benchmark/GPUBenchmarkHelper.h   |  693 ++++++
 .../nbl/examples/Benchmark/IBenchmark.h       |  409 ++++
 10 files changed, 3634 insertions(+), 1810 deletions(-)
 create mode 100644 common/include/nbl/examples/Benchmark/BenchmarkCli.h
 create mode 100644 common/include/nbl/examples/Benchmark/BenchmarkConsole.h
 create mode 100644 common/include/nbl/examples/Benchmark/BenchmarkJson.h
 create mode 100644 common/include/nbl/examples/Benchmark/BenchmarkTypes.h
 create mode 100644 common/include/nbl/examples/Benchmark/GPUBenchmarkHelper.h
 create mode 100644 common/include/nbl/examples/Benchmark/IBenchmark.h
diff --git a/37_HLSLSamplingTests/benchmarks/CDiscreteSamplerBenchmark.h b/37_HLSLSamplingTests/benchmarks/CDiscreteSamplerBenchmark.h
index b2a2fad9a..d6289f54b 100644
--- a/37_HLSLSamplingTests/benchmarks/CDiscreteSamplerBenchmark.h
+++ b/37_HLSLSamplingTests/benchmarks/CDiscreteSamplerBenchmark.h
@@ -6,187 +6,144 @@
 #include <nbl/builtin/hlsl/sampling/alias_table_builder.h>
 #include <nbl/builtin/hlsl/sampling/cumulative_probability_builder.h>
 #include "app_resources/common/discrete_sampler_bench.hlsl"
+#include "nbl/examples/Benchmark/IBenchmark.h"
+#include "nbl/examples/Benchmark/GPUBenchmarkHelper.h"
 
 #include <random>
 
 using namespace nbl;
 
-class CDiscreteSamplerBenchmark
+class CDiscreteSamplerBenchmark : public GPUBenchmark
 {
    public:
-   struct SetupData
+   // Declared up-front because it's used as the index domain for m_pipelineIdx[]
+   // (a member-array bound needs the type complete in declaration order).
+   enum class SamplerKind : uint32_t
    {
-      core::smart_refctd_ptr<ILogicalDevice>    device;
-      core::smart_refctd_ptr<CVulkanConnection> api;
-      core::smart_refctd_ptr<IAssetManager>     assetMgr;
-      core::smart_refctd_ptr<ILogger>          logger;
-      IPhysicalDevice*                          physicalDevice;
-      std::string                                      packedAliasAShaderKey;
-      std::string                                      packedAliasBShaderKey;
-      std::string                                      cumProbShaderKey;
-      std::string                                      cumProbYoloShaderKey;
-      std::string                                      cumProbEytzingerShaderKey;
-      uint32_t                                         computeFamilyIndex;
-      uint32_t                                         dispatchGroupCount;
+      AliasPackedA = 0,
+      AliasPackedB,
+      CumProbCompare,
+      CumProbYolo,
+      CumProbEytzinger,
+      Count
    };
 
-   void setup(const SetupData& data)
+   struct SetupData
    {
-      m_device             = data.device;
-      m_logger             = data.logger;
-      m_assetMgr           = data.assetMgr;
-      m_dispatchGroupCount = data.dispatchGroupCount;
-      m_physicalDevice     = data.physicalDevice;
+      core::smart_refctd_ptr<IAssetManager> assetMgr;
+      // Each pipeline is independent; main.cpp can pick precompiled or runtime per
+      // pipeline by passing ShaderVariant::Precompiled(get_spirv_key<...>()) or
+      // ShaderVariant::FromSource(path, defines) respectively.
+      GPUBenchmarkHelper::ShaderVariant packedAliasAVariant;
+      GPUBenchmarkHelper::ShaderVariant packedAliasBVariant;
+      GPUBenchmarkHelper::ShaderVariant cumProbVariant;
+      GPUBenchmarkHelper::ShaderVariant cumProbYoloVariant;
+      GPUBenchmarkHelper::ShaderVariant cumProbEytzingerVariant;
+      hlsl::uint32_t3                   dispatchGroupCount;
+      uint64_t                          targetBudgetMs = 400; // wall-clock budget per sweep row
+      // N values the sweep cycles through. Dispatch count per row is auto-sized
+      // by runTimedBudgeted to hit the budget.
+      std::span<const uint32_t> sweepNs;
+   };
 
-      m_queue = m_device->getQueue(data.computeFamilyIndex, 0);
+   // Shape is derivable from SetupData; expose it so the caller can use it
+   // both to configure the bench and to build the matching RunContext for the
+   // span that runs this bench
+   static WorkloadShape shapeFor(const SetupData& data)
+   {
+      const uint32_t totalThreads       = data.dispatchGroupCount.x * data.dispatchGroupCount.y * data.dispatchGroupCount.z * WORKGROUP_SIZE;
+      const uint64_t samplesPerDispatch = uint64_t(totalThreads) * uint64_t(BENCH_ITERS);
+      return {
+         .workgroupSize      = {WORKGROUP_SIZE, 1u, 1u},
+         .dispatchGroupCount = data.dispatchGroupCount,
+         .samplesPerDispatch = samplesPerDispatch,
+      };
+   }
 
-      // Staging-upload utility. Without this, BDA buffers land in host-visible (system RAM)
-      // and every sampler load becomes a PCIe round-trip instead of hitting VRAM/L2.
-      m_utils = IUtilities::create(core::smart_refctd_ptr(m_device), core::smart_refctd_ptr(m_logger));
+   CDiscreteSamplerBenchmark(Aggregator& aggregator, const SetupData& data)
+      : GPUBenchmark(aggregator, GPUBenchmark::SetupData{
+                                    .name             = {}, // per-row names synthesized at run time
+                                    .warmupDispatches = 0,
+                                    .shape            = shapeFor(data),
+                                    .targetBudgetMs   = data.targetBudgetMs,
+                                 })
+   {
+      const uint32_t totalThreads = data.dispatchGroupCount.x * data.dispatchGroupCount.y * data.dispatchGroupCount.z * WORKGROUP_SIZE;
 
-      // Command pool + buffers
-      m_cmdpool = m_device->createCommandPool(data.computeFamilyIndex, IGPUCommandPool::CREATE_FLAGS::RESET_COMMAND_BUFFER_BIT);
-      m_cmdpool->createCommandBuffers(IGPUCommandPool::BUFFER_LEVEL::PRIMARY, 1u, &m_benchCmdbuf);
+      m_assetMgr = data.assetMgr;
+      m_sweepNs  = data.sweepNs;
 
-      // Timestamp query pool
+      for (const uint32_t N : m_sweepNs)
       {
-         IQueryPool::SCreationParams qp = {};
-         qp.queryType                          = IQueryPool::TYPE::TIMESTAMP;
-         qp.queryCount                         = 2;
-         qp.pipelineStatisticsFlags            = IQueryPool::PIPELINE_STATISTICS_FLAGS::NONE;
-         m_queryPool                           = m_device->createQueryPool(qp);
+         const std::string nStr = std::format("N={}", N);
+         for (const auto& v : kSweepVariants)
+            registerVariant({nStr, v.family, v.leaf});
       }
 
-      const uint32_t totalThreads = m_dispatchGroupCount * WORKGROUP_SIZE;
-
       // Shared output buffer (size only depends on thread count). GPU writes via BDA and
-      // nothing reads it on the CPU, so pin it to device-local VRAM.
-      {
-         IGPUBuffer::SCreationParams bp                      = {};
-         bp.size                                                    = totalThreads * sizeof(uint32_t);
-         bp.usage                                                   = core::bitflag(IGPUBuffer::EUF_STORAGE_BUFFER_BIT) | IGPUBuffer::EUF_SHADER_DEVICE_ADDRESS_BIT;
-         m_outputBuf                                                = m_device->createBuffer(std::move(bp));
-         IDeviceMemoryBacked::SDeviceMemoryRequirements reqs = m_outputBuf->getMemoryReqs();
-         reqs.memoryTypeBits &= data.physicalDevice->getDeviceLocalMemoryTypeBits();
-         m_device->allocate(reqs, m_outputBuf.get(), IDeviceMemoryAllocation::EMAF_DEVICE_ADDRESS_BIT);
-      }
-
-      // Pipelines (N-independent; only push constants change per run)
-      m_packedAliasAPipeline     = createPipeline<PackedAliasABPushConstants>(data.packedAliasAShaderKey, m_packedAliasAPplnLayout, "alias-packed-A");
-      m_packedAliasBPipeline     = createPipeline<PackedAliasABPushConstants>(data.packedAliasBShaderKey, m_packedAliasBPplnLayout, "alias-packed-B");
-      m_cumProbPipeline          = createPipeline<CumProbPushConstants>(data.cumProbShaderKey, m_cumProbPplnLayout, "cumprob-comparator");
-      m_cumProbYoloPipeline      = createPipeline<CumProbPushConstants>(data.cumProbYoloShaderKey, m_cumProbYoloPplnLayout, "cumprob-yolo");
-      m_cumProbEytzingerPipeline = createPipeline<CumProbPushConstants>(data.cumProbEytzingerShaderKey, m_cumProbEytzingerPplnLayout, "cumprob-eytzinger");
+      // nothing reads it on the CPU.
+      m_outputBuf = createBdaOutputBuffer(totalThreads * sizeof(uint32_t)).buf;
+
+      // Pipelines (N-independent; only push constants change per run). Indices
+      // into m_pipelines (GPUBenchmarkHelper) are stored in the same order as SamplerKind
+      // so the sweep's variant table can index by enum directly.
+      m_pipelineIdx[static_cast<size_t>(SamplerKind::AliasPackedA)]     = createPipeline(data.packedAliasAVariant, m_assetMgr, sizeof(PackedAliasABPushConstants), "alias-packed-A");
+      m_pipelineIdx[static_cast<size_t>(SamplerKind::AliasPackedB)]     = createPipeline(data.packedAliasBVariant, m_assetMgr, sizeof(PackedAliasABPushConstants), "alias-packed-B");
+      m_pipelineIdx[static_cast<size_t>(SamplerKind::CumProbCompare)]   = createPipeline(data.cumProbVariant, m_assetMgr, sizeof(CumProbPushConstants), "cumprob-comparator");
+      m_pipelineIdx[static_cast<size_t>(SamplerKind::CumProbYolo)]      = createPipeline(data.cumProbYoloVariant, m_assetMgr, sizeof(CumProbPushConstants), "cumprob-yolo");
+      m_pipelineIdx[static_cast<size_t>(SamplerKind::CumProbEytzinger)] = createPipeline(data.cumProbEytzingerVariant, m_assetMgr, sizeof(CumProbPushConstants), "cumprob-eytzinger");
    }
 
-   // DispatchScheduler: uint32_t N -> std::pair<uint32_t warmup, uint32_t bench>.
-   // Lets the caller trade wall-clock for statistical stability per size:
-   // big-N runs are DRAM-bound and need fewer dispatches to hit the same total sample count.
-   struct DispatchCounts
+   // Rows are synthesized per (N, variant), not a single named entry, so
+   // each row checks cli.focusVariants individually. The aggregator's silent
+   // flag selects which half (focused / unfocused) we contribute to.
+   void run() override
    {
-      uint32_t warmup;
-      uint32_t bench;
-   };
-
-   template<typename DispatchScheduler>
-   void runSweep(const std::vector<uint32_t>& tableSizes, DispatchScheduler scheduler)
-   {
-      const uint32_t totalThreads = m_dispatchGroupCount * WORKGROUP_SIZE;
-      m_logger->log("=== GPU Discrete Sampler Benchmark sweep (%u threads * %u iters/thread; wg=%u; dispatches chosen per-N) ===",
-         ILogger::ELL_PERFORMANCE, totalThreads, BENCH_ITERS, WORKGROUP_SIZE);
-      m_logger->log("%12s | %-34s | %12s | %12s | %12s | %10s", ILogger::ELL_PERFORMANCE,
-         "N", "Sampler", "ps/sample", "GSamples/s", "ms total", "dispatches");
+      const bool focusedPhase = isFocusPhase();
+      // Warmup is small and fixed; budgeted measurement auto-sizes the
+      // measured-dispatch count to hit getTargetBudgetMs().
+      constexpr uint32_t kWarmupDispatches = 64;
 
-      for (uint32_t N : tableSizes)
+      for (const uint32_t N : m_sweepNs)
       {
-         const DispatchCounts dc = scheduler(N);
-         buildAndUpload(N);
-         // Packed A wins N<=16k; Packed B wins N>=32k. SoA and Packed C were dominated
-         // across every N measured, removed from the sweep.
-         runSingle(N, "AliasTable (packed A, 4 B)", m_packedAliasAPipeline, m_packedAliasAPplnLayout, SamplerKind::AliasPackedA, dc.warmup, dc.bench);
-         runSingle(N, "AliasTable (packed B, 8 B)", m_packedAliasBPipeline, m_packedAliasBPplnLayout, SamplerKind::AliasPackedB, dc.warmup, dc.bench);
-         runSingle(N, "CumulativeProbability", m_cumProbPipeline, m_cumProbPplnLayout, SamplerKind::CumProbCompare, dc.warmup, dc.bench);
-         runSingle(N, "CumulativeProbability (YOLO)", m_cumProbYoloPipeline, m_cumProbYoloPplnLayout, SamplerKind::CumProbYolo, dc.warmup, dc.bench);
-         runSingle(N, "CumulativeProbability (Eytzinger)", m_cumProbEytzingerPipeline, m_cumProbEytzingerPplnLayout, SamplerKind::CumProbEytzinger, dc.warmup, dc.bench);
-         releaseTables();
+         const std::string nStr = std::format("N={}", N);
+         bool              built = false;
+         for (const auto& [family, leaf, kind] : kSweepVariants)
+         {
+            core::vector<core::string> name      = {nStr, family, leaf};
+            const bool                 inFocus   = isFocused(name);
+            const bool                 shouldRun = focusedPhase ? inFocus : !inFocus;
+            if (!shouldRun)
+               continue;
+            if (!built)
+            {
+               buildAndUpload(N);
+               built = true;
+            }
+            runSingle(N, std::move(name), kind, kWarmupDispatches);
+         }
+         if (built)
+            releaseTables();
       }
    }
 
-   // Convenience: sweep with fixed dispatch counts for every size.
-   void runSweep(const std::vector<uint32_t>& tableSizes, uint32_t warmupIterations = 500, uint32_t benchmarkIterations = 5000)
-   {
-      runSweep(tableSizes, [warmupIterations, benchmarkIterations](uint32_t) -> DispatchCounts
-         { return {warmupIterations, benchmarkIterations}; });
-   }
-
    private:
-   enum class SamplerKind
+   // (family, leaf, kind) for every variant the sweep runs.
+   struct SweepVariant
    {
-      AliasPackedA,
-      AliasPackedB,
-      CumProbCompare,
-      CumProbYolo,
-      CumProbEytzinger
+      const char* family; // e.g. "AliasTable"
+      const char* leaf;   // e.g. "packed A, 4 B"
+      SamplerKind kind;
+   };
+   static constexpr SweepVariant kSweepVariants[] = {
+      {"AliasTable", "packed A, 4 B", SamplerKind::AliasPackedA},
+      {"AliasTable", "packed B, 8 B", SamplerKind::AliasPackedB},
+      {"CumulativeProbability", "comparator", SamplerKind::CumProbCompare},
+      {"CumulativeProbability", "YOLO", SamplerKind::CumProbYolo},
+      {"CumulativeProbability", "Eytzinger", SamplerKind::CumProbEytzinger},
    };
 
-   template<typename PushConstantT>
-   core::smart_refctd_ptr<IGPUComputePipeline> createPipeline(const std::string& shaderKey, core::smart_refctd_ptr<IGPUPipelineLayout>& outLayout, const char* tag)
-   {
-      const SPushConstantRange pcRange = {
-         .stageFlags = IShader::E_SHADER_STAGE::ESS_COMPUTE,
-         .offset     = 0,
-         .size       = sizeof(PushConstantT)};
-      auto layout = m_device->createPipelineLayout({&pcRange, 1});
-      if (!layout)
-         m_logger->log("CDiscreteSamplerBenchmark: failed to create %s pipeline layout", ILogger::ELL_ERROR, tag);
-
-      IAssetLoader::SAssetLoadParams lp = {};
-      lp.logger                                = m_logger.get();
-      lp.workingDirectory                      = "app_resources";
-      auto bundle                              = m_assetMgr->getAsset(shaderKey, lp);
-      auto source                              = IAsset::castDown<IShader>(bundle.getContents()[0]);
-      auto shader                              = m_device->compileShader({.source = source.get()});
-      if (!shader)
-         m_logger->log("CDiscreteSamplerBenchmark: failed to load %s shader", ILogger::ELL_ERROR, tag);
-
-      IGPUComputePipeline::SCreationParams pp = {};
-      pp.layout                                      = layout.get();
-      pp.shader.shader                               = shader.get();
-      pp.shader.entryPoint                           = "main";
-      if (m_device->getEnabledFeatures().pipelineExecutableInfo)
-      {
-         pp.flags |= IGPUComputePipeline::SCreationParams::FLAGS::CAPTURE_STATISTICS | IGPUComputePipeline::SCreationParams::FLAGS::CAPTURE_INTERNAL_REPRESENTATIONS;
-      }
-
-      core::smart_refctd_ptr<IGPUComputePipeline> pipeline;
-      if (!m_device->createComputePipelines(nullptr, {&pp, 1}, &pipeline))
-         m_logger->log("CDiscreteSamplerBenchmark: failed to create %s compute pipeline", ILogger::ELL_ERROR, tag);
-
-      if (m_device->getEnabledFeatures().pipelineExecutableInfo)
-      {
-         auto report = system::to_string(pipeline->getExecutableInfo());
-         m_logger->log("%s Sampling Pipeline Executable Report:\n%s", ILogger::ELL_PERFORMANCE, tag, report.c_str());
-      }
-      outLayout = std::move(layout);
-      return pipeline;
-   }
-
-   core::smart_refctd_ptr<IGPUBuffer> createBdaBuffer(const void* srcData, size_t bytes)
-   {
-      IGPUBuffer::SCreationParams bp = {};
-      bp.size                               = bytes;
-      bp.usage                              = core::bitflag(IGPUBuffer::EUF_STORAGE_BUFFER_BIT) |
-         IGPUBuffer::EUF_SHADER_DEVICE_ADDRESS_BIT |
-         IGPUBuffer::EUF_TRANSFER_DST_BIT;
-
-      core::smart_refctd_ptr<IGPUBuffer> buf;
-      auto                                      future = m_utils->createFilledDeviceLocalBufferOnDedMem(
-         SIntendedSubmitInfo {.queue = m_queue}, std::move(bp), srcData);
-      future.move_into(buf);
-      return buf;
-   }
-
-   void buildAndUpload(uint32_t N)
+   void buildAndUpload(const uint32_t N)
    {
       m_currentN = N;
 
@@ -204,8 +161,8 @@ class CDiscreteSamplerBenchmark
       std::vector<float>    aliasPdf;
       m_aliasTableN = sampling::AliasTableBuilder<float>::build({weights}, aliasProb, aliasIdx, aliasPdf);
 
-      constexpr uint32_t                                         kPackedLog2N = 26u;
-      std::vector<uint32_t>                                      packedA(m_aliasTableN);
+      constexpr uint32_t                              kPackedLog2N = 26u;
+      std::vector<uint32_t>                           packedA(m_aliasTableN);
       std::vector<sampling::PackedAliasEntryB<float>> packedB(m_aliasTableN);
       sampling::AliasTableBuilder<float>::packA<kPackedLog2N>({aliasProb}, {aliasIdx}, packedA.data());
       sampling::AliasTableBuilder<float>::packB<kPackedLog2N>({aliasProb}, {aliasIdx}, {aliasPdf}, packedB.data());
@@ -236,104 +193,45 @@ class CDiscreteSamplerBenchmark
       m_cumProbEytzingerBuf = nullptr;
    }
 
-   void runSingle(uint32_t N, const char* name, const core::smart_refctd_ptr<IGPUComputePipeline>& pipeline, const core::smart_refctd_ptr<IGPUPipelineLayout>& layout, SamplerKind kind, uint32_t warmupIterations, uint32_t benchmarkIterations)
+   void runSingle(uint32_t N, core::vector<core::string> name, SamplerKind kind, uint32_t warmupIterations)
    {
-      m_device->waitIdle();
-
-      // Everything (warmup, timestamped bench, cooldown) goes into ONE cmdbuf and ONE
-      // submit. Serial submissions with semaphore waits between them would add sync cost
-      // to every dispatch and prevent the driver from overlapping adjacent dispatches.
-      // With a single cmdbuf the driver pipelines freely, and GPU memory latency is
-      // hidden by warp hyperthreading rather than by cross-submit overlap.
-      //
-      // Layout: [warmup dispatches] [ts 0] [bench dispatches] [ts 1] [cooldown dispatches]
-      // Warmup brings clocks + caches to steady state before ts 0. Cooldown keeps the
-      // same steady-state context alive across ts 1 so the trailing bench dispatches
-      // don't measure a tail where the GPU is already winding down.
-      const uint32_t cooldownIterations = warmupIterations;
-
-      m_benchCmdbuf->reset(IGPUCommandBuffer::RESET_FLAGS::NONE);
-      m_benchCmdbuf->begin(IGPUCommandBuffer::USAGE::ONE_TIME_SUBMIT_BIT);
-      m_benchCmdbuf->resetQueryPool(m_queryPool.get(), 0, 2);
-      m_benchCmdbuf->bindComputePipeline(pipeline.get());
-
-      if (kind == SamplerKind::AliasPackedA || kind == SamplerKind::AliasPackedB)
-      {
-         PackedAliasABPushConstants pc = {};
-         pc.entriesAddress             = (kind == SamplerKind::AliasPackedA ? m_packedAliasABuf : m_packedAliasBBuf)->getDeviceAddress();
-         pc.pdfAddress                 = m_aliasPdfBuf->getDeviceAddress();
-         pc.outputAddress              = m_outputBuf->getDeviceAddress();
-         pc.tableSize                  = m_aliasTableN;
-         m_benchCmdbuf->pushConstants(layout.get(), IShader::E_SHADER_STAGE::ESS_COMPUTE, 0u, sizeof(pc), &pc);
-      }
-      else
-      {
-         CumProbPushConstants pc  = {};
-         const auto&          buf = (kind == SamplerKind::CumProbEytzinger) ? m_cumProbEytzingerBuf : m_cumProbBuf;
-         pc.cumProbAddress        = buf->getDeviceAddress();
-         pc.outputAddress         = m_outputBuf->getDeviceAddress();
-         pc.tableSize             = N;
-         m_benchCmdbuf->pushConstants(layout.get(), IShader::E_SHADER_STAGE::ESS_COMPUTE, 0u, sizeof(pc), &pc);
-      }
-
-      for (uint32_t i = 0u; i < warmupIterations; ++i)
-         m_benchCmdbuf->dispatch(m_dispatchGroupCount, 1, 1);
-      m_benchCmdbuf->writeTimestamp(PIPELINE_STAGE_FLAGS::COMPUTE_SHADER_BIT, m_queryPool.get(), 0);
-      for (uint32_t i = 0u; i < benchmarkIterations; ++i)
-         m_benchCmdbuf->dispatch(m_dispatchGroupCount, 1, 1);
-      m_benchCmdbuf->writeTimestamp(PIPELINE_STAGE_FLAGS::COMPUTE_SHADER_BIT, m_queryPool.get(), 1);
-      for (uint32_t i = 0u; i < cooldownIterations; ++i)
-         m_benchCmdbuf->dispatch(m_dispatchGroupCount, 1, 1);
-      m_benchCmdbuf->end();
-
-      auto                                                 semaphore   = m_device->createSemaphore(0u);
-      const IQueue::SSubmitInfo::SCommandBufferInfo benchCmds[] = {{.cmdbuf = m_benchCmdbuf.get()}};
-      const IQueue::SSubmitInfo::SSemaphoreInfo     signalSem[] = {
-         {.semaphore = semaphore.get(), .value = 1u, .stageMask = PIPELINE_STAGE_FLAGS::COMPUTE_SHADER_BIT}};
-      IQueue::SSubmitInfo submit = {};
-      submit.commandBuffers             = benchCmds;
-      submit.signalSemaphores           = signalSem;
-      m_queue->submit({&submit, 1u});
-
-      m_device->waitIdle();
-
-      uint64_t   timestamps[2] = {};
-      const auto flags         = core::bitflag(IQueryPool::RESULTS_FLAGS::_64_BIT) |
-         core::bitflag(IQueryPool::RESULTS_FLAGS::WAIT_BIT);
-      m_device->getQueryPoolResults(m_queryPool.get(), 0, 2, timestamps, sizeof(uint64_t), flags);
-
-      constexpr uint32_t benchIters      = BENCH_ITERS;
-      const float64_t    timestampPeriod = float64_t(m_physicalDevice->getLimits().timestampPeriodInNanoSeconds);
-      const float64_t    elapsed_ns      = float64_t(timestamps[1] - timestamps[0]) * timestampPeriod;
-      const uint64_t     totalThreads    = uint64_t(m_dispatchGroupCount) * uint64_t(WORKGROUP_SIZE);
-      const uint64_t     totalSamples    = uint64_t(benchmarkIterations) * totalThreads * uint64_t(benchIters);
-      const float64_t    ps_per_sample   = elapsed_ns * 1e3 / float64_t(totalSamples);
-      const float64_t    gsamples_per_s  = float64_t(totalSamples) / elapsed_ns;
-      const float64_t    elapsed_ms      = elapsed_ns * 1e-6;
-
-      m_logger->log("%12u | %-34s | %12.3f | %12.3f | %12.3f | %10u",
-         ILogger::ELL_PERFORMANCE, N, name, ps_per_sample, gsamples_per_s, elapsed_ms, benchmarkIterations);
+      // Pipeline + push constants are bound *once* in bindOnce, the inner loop is just
+      // dispatch(...). Putting binds inside dispatchOne would inflate ps/sample on the
+      // tighter samplers.
+      const PipelineEntry& pe = m_pipelines[m_pipelineIdx[size_t(kind)]];
+
+      const TimingResult timingResult = runTimedBudgeted(warmupIterations, getTargetBudgetMs(),
+         [&](IGPUCommandBuffer* cb)
+         {
+            if (kind == SamplerKind::AliasPackedA || kind == SamplerKind::AliasPackedB)
+            {
+               PackedAliasABPushConstants pc = {};
+               pc.entriesAddress             = (kind == SamplerKind::AliasPackedA ? m_packedAliasABuf : m_packedAliasBBuf)->getDeviceAddress();
+               pc.pdfAddress                 = m_aliasPdfBuf->getDeviceAddress();
+               pc.outputAddress              = m_outputBuf->getDeviceAddress();
+               pc.tableSize                  = m_aliasTableN;
+               defaultBindAndPush(cb, pe, pc);
+            }
+            else
+            {
+               CumProbPushConstants pc  = {};
+               const auto&          buf = (kind == SamplerKind::CumProbEytzinger) ? m_cumProbEytzingerBuf : m_cumProbBuf;
+               pc.cumProbAddress        = buf->getDeviceAddress();
+               pc.outputAddress         = m_outputBuf->getDeviceAddress();
+               pc.tableSize             = N;
+               defaultBindAndPush(cb, pe, pc);
+            }
+         },
+         [this](IGPUCommandBuffer* cb) { defaultDispatch(cb); },
+         samplesForCurrentRow());
+
+      record(std::move(name), timingResult, pe.stats);
    }
 
-   core::smart_refctd_ptr<ILogicalDevice>    m_device;
-   core::smart_refctd_ptr<ILogger>          m_logger;
-   core::smart_refctd_ptr<IAssetManager>     m_assetMgr;
-   core::smart_refctd_ptr<IUtilities>        m_utils;
-   core::smart_refctd_ptr<IGPUCommandPool>   m_cmdpool;
-   core::smart_refctd_ptr<IGPUCommandBuffer> m_benchCmdbuf;
-   core::smart_refctd_ptr<IQueryPool>        m_queryPool;
+   core::smart_refctd_ptr<IAssetManager> m_assetMgr;
 
-   // Pipelines (set up once)
-   core::smart_refctd_ptr<IGPUPipelineLayout>  m_packedAliasAPplnLayout;
-   core::smart_refctd_ptr<IGPUComputePipeline> m_packedAliasAPipeline;
-   core::smart_refctd_ptr<IGPUPipelineLayout>  m_packedAliasBPplnLayout;
-   core::smart_refctd_ptr<IGPUComputePipeline> m_packedAliasBPipeline;
-   core::smart_refctd_ptr<IGPUPipelineLayout>  m_cumProbPplnLayout;
-   core::smart_refctd_ptr<IGPUComputePipeline> m_cumProbPipeline;
-   core::smart_refctd_ptr<IGPUPipelineLayout>  m_cumProbYoloPplnLayout;
-   core::smart_refctd_ptr<IGPUComputePipeline> m_cumProbYoloPipeline;
-   core::smart_refctd_ptr<IGPUPipelineLayout>  m_cumProbEytzingerPplnLayout;
-   core::smart_refctd_ptr<IGPUComputePipeline> m_cumProbEytzingerPipeline;
+   // Indices into m_pipelines (GPUBenchmarkHelper), indexed by SamplerKind.
+   uint32_t m_pipelineIdx[size_t(SamplerKind::Count)] = {};
 
    // Per-N data buffers (rebuilt each sweep step). pdf[] is shared between A and B.
    core::smart_refctd_ptr<IGPUBuffer> m_aliasPdfBuf;
@@ -344,11 +242,9 @@ class CDiscreteSamplerBenchmark
 
    // Shared
    core::smart_refctd_ptr<IGPUBuffer> m_outputBuf;
-   IQueue*                            m_queue              = nullptr;
-   IPhysicalDevice*                   m_physicalDevice     = nullptr;
-   uint32_t                                  m_dispatchGroupCount = 0;
-   uint32_t                                  m_currentN           = 0;
-   uint32_t                                  m_aliasTableN        = 0;
+   uint32_t                           m_currentN    = 0;
+   uint32_t                           m_aliasTableN = 0;
+   std::span<const uint32_t>          m_sweepNs;
 };
 
 #endif
diff --git a/37_HLSLSamplingTests/benchmarks/CSamplerBenchmark.h b/37_HLSLSamplingTests/benchmarks/CSamplerBenchmark.h
index 4f63c6fde..dd7149829 100644
--- a/37_HLSLSamplingTests/benchmarks/CSamplerBenchmark.h
+++ b/37_HLSLSamplingTests/benchmarks/CSamplerBenchmark.h
@@ -7,249 +7,54 @@
 
 #include <nabla.h>
 #include "nbl/examples/examples.hpp"
-#include "../app_resources/common/sampler_bench_pc.hlsl"
+#include "nbl/examples/Benchmark/IBenchmark.h"
+#include "nbl/examples/Benchmark/GPUBenchmarkHelper.h"
+#include "app_resources/common/sampler_bench_pc.hlsl"
 
 using namespace nbl;
 
 // Measures GPU execution time of a sampler shader using GPU timestamp queries.
-class CSamplerBenchmark
+// Output is implicit BDA addressed via SamplerBenchPushConstants. GPU plumbing
+// (pipeline / buffer / timestamp queries) comes from GPUBenchmarkHelper; the
+// bench-side glue here is PC layout + per-run dispatch + result recording.
+class CSamplerBenchmark : public GPUBenchmark
 {
-public:
-	struct SetupData
-	{
-		core::smart_refctd_ptr<video::ILogicalDevice> device;
-		core::smart_refctd_ptr<video::CVulkanConnection> api;
-		core::smart_refctd_ptr<asset::IAssetManager> assetMgr;
-		core::smart_refctd_ptr<system::ILogger> logger;
-		video::IPhysicalDevice* physicalDevice;
-		uint32_t computeFamilyIndex;
-		std::string shaderKey;
-		uint32_t dispatchGroupCount;  // workgroup count = testBatchCount
-		uint32_t samplesPerDispatch;  // dispatchGroupCount * WorkgroupSize * benchIters
-		size_t inputBufferBytes;      // sizeof(InputType) * samplesPerDispatch
-		size_t outputBufferBytes;     // sizeof(ResultType) * samplesPerDispatch
-	};
-
-	void setup(const SetupData& data)
-	{
-		m_device = data.device;
-		m_logger = data.logger;
-		m_dispatchGroupCount = data.dispatchGroupCount;
-
-		// Single cmdbuf holds [warmup dispatches][ts 0][bench dispatches][ts 1][cooldown dispatches]
-		// so the driver can pipeline adjacent dispatches and the trailing bench dispatches
-		// aren't measured in a winding-down tail.
-		m_cmdpool = m_device->createCommandPool(data.computeFamilyIndex, video::IGPUCommandPool::CREATE_FLAGS::RESET_COMMAND_BUFFER_BIT);
-		if (!m_cmdpool->createCommandBuffers(video::IGPUCommandPool::BUFFER_LEVEL::PRIMARY, 1u, &m_benchmarkCmdbuf))
-			m_logger->log("CSamplerBenchmark: failed to create benchmark cmdbuf", system::ILogger::ELL_ERROR);
-
-		// Timestamp query pool (2 queries: before and after)
-		{
-			video::IQueryPool::SCreationParams qparams = {};
-			qparams.queryType = video::IQueryPool::TYPE::TIMESTAMP;
-			qparams.queryCount = 2;
-			qparams.pipelineStatisticsFlags = video::IQueryPool::PIPELINE_STATISTICS_FLAGS::NONE;
-			m_queryPool = m_device->createQueryPool(qparams);
-			if (!m_queryPool)
-				m_logger->log("CSamplerBenchmark: failed to create query pool", system::ILogger::ELL_ERROR);
-		}
-
-		// Load and compile shader
-		core::smart_refctd_ptr<asset::IShader> shader;
-		{
-			asset::IAssetLoader::SAssetLoadParams lp = {};
-			lp.logger = m_logger.get();
-			lp.workingDirectory = "app_resources";
-			auto bundle = data.assetMgr->getAsset(data.shaderKey, lp);
-			const auto assets = bundle.getContents();
-			if (assets.empty())
-			{
-				m_logger->log("CSamplerBenchmark: failed to load shader", system::ILogger::ELL_ERROR);
-				return;
-			}
-			auto source = asset::IAsset::castDown<asset::IShader>(assets[0]);
-			shader = m_device->compileShader({ source.get() });
-		}
-
-		// Descriptor set layout: binding 0 = input SSBO, binding 1 = output SSBO
-		video::IGPUDescriptorSetLayout::SBinding bindings[2] = {
-			{ .binding = 0, .type = asset::IDescriptor::E_TYPE::ET_STORAGE_BUFFER,
-			  .createFlags = video::IGPUDescriptorSetLayout::SBinding::E_CREATE_FLAGS::ECF_NONE,
-			  .stageFlags = ShaderStage::ESS_COMPUTE, .count = 1 },
-			{ .binding = 1, .type = asset::IDescriptor::E_TYPE::ET_STORAGE_BUFFER,
-			  .createFlags = video::IGPUDescriptorSetLayout::SBinding::E_CREATE_FLAGS::ECF_NONE,
-			  .stageFlags = ShaderStage::ESS_COMPUTE, .count = 1 }
-		};
-		auto dsLayout = m_device->createDescriptorSetLayout(bindings);
-
-		const asset::SPushConstantRange pcRange = {
-			.stageFlags = asset::IShader::E_SHADER_STAGE::ESS_COMPUTE,
-			.offset     = 0,
-			.size       = sizeof(SamplerBenchPushConstants),
-		};
-		m_pplnLayout = m_device->createPipelineLayout({&pcRange, 1}, core::smart_refctd_ptr(dsLayout));
-
-		{
-			video::IGPUComputePipeline::SCreationParams pparams = {};
-			pparams.layout = m_pplnLayout.get();
-			pparams.shader.entryPoint = "main";
-			pparams.shader.shader = shader.get();
-         if (m_device->getEnabledFeatures().pipelineExecutableInfo)
-         {
-            pparams.flags |= IGPUComputePipeline::SCreationParams::FLAGS::CAPTURE_STATISTICS | IGPUComputePipeline::SCreationParams::FLAGS::CAPTURE_INTERNAL_REPRESENTATIONS;
-         }
-			if (!m_device->createComputePipelines(nullptr, { &pparams, 1 }, &m_pipeline))
-				m_logger->log("CSamplerBenchmark: failed to create compute pipeline", system::ILogger::ELL_ERROR);
-
-         if (m_device->getEnabledFeatures().pipelineExecutableInfo)
-               m_executableReport = system::to_string(m_pipeline->getExecutableInfo());
-		}
-
-		// Allocate input buffer (device-local VRAM, zero-filled via cmdFillBuffer; correctness
-		// irrelevant for benchmarking but we want deterministic input, not garbage)
-		core::smart_refctd_ptr<video::IGPUBuffer> inputBuf;
-		{
-			video::IGPUBuffer::SCreationParams bparams = {};
-			bparams.size = data.inputBufferBytes;
-			bparams.usage = core::bitflag(video::IGPUBuffer::EUF_STORAGE_BUFFER_BIT) | video::IGPUBuffer::EUF_TRANSFER_DST_BIT;
-			inputBuf = m_device->createBuffer(std::move(bparams));
-			video::IDeviceMemoryBacked::SDeviceMemoryRequirements reqs = inputBuf->getMemoryReqs();
-			reqs.memoryTypeBits &= data.physicalDevice->getDeviceLocalMemoryTypeBits();
-			m_inputAlloc = m_device->allocate(reqs, inputBuf.get(), video::IDeviceMemoryAllocation::EMAF_NONE);
-			if (!m_inputAlloc.isValid())
-				m_logger->log("CSamplerBenchmark: failed to allocate input buffer memory", system::ILogger::ELL_ERROR);
-		}
-
-		// Allocate output buffer (device-local VRAM, GPU writes, never read back)
-		core::smart_refctd_ptr<video::IGPUBuffer> outputBuf;
-		{
-			video::IGPUBuffer::SCreationParams bparams = {};
-			bparams.size = data.outputBufferBytes;
-			bparams.usage = core::bitflag(video::IGPUBuffer::EUF_STORAGE_BUFFER_BIT) | video::IGPUBuffer::EUF_SHADER_DEVICE_ADDRESS_BIT;
-			outputBuf = m_device->createBuffer(std::move(bparams));
-			video::IDeviceMemoryBacked::SDeviceMemoryRequirements reqs = outputBuf->getMemoryReqs();
-			reqs.memoryTypeBits &= data.physicalDevice->getDeviceLocalMemoryTypeBits();
-			m_outputAlloc = m_device->allocate(reqs, outputBuf.get(), video::IDeviceMemoryAllocation::EMAF_DEVICE_ADDRESS_BIT);
-			if (!m_outputAlloc.isValid())
-				m_logger->log("CSamplerBenchmark: failed to allocate output buffer memory", system::ILogger::ELL_ERROR);
-			m_outputAddress = outputBuf->getDeviceAddress();
-		}
-
-		// Zero-fill the input buffer once on the GPU
-		{
-			core::smart_refctd_ptr<video::IGPUCommandBuffer> initCmdbuf;
-			m_cmdpool->createCommandBuffers(video::IGPUCommandPool::BUFFER_LEVEL::PRIMARY, 1u, &initCmdbuf);
-			initCmdbuf->begin(video::IGPUCommandBuffer::USAGE::ONE_TIME_SUBMIT_BIT);
-			const asset::SBufferRange<video::IGPUBuffer> range = { .offset = 0, .size = data.inputBufferBytes, .buffer = inputBuf };
-			initCmdbuf->fillBuffer(range, 0u);
-			initCmdbuf->end();
-
-			auto queue = m_device->getQueue(data.computeFamilyIndex, 0);
-			const video::IQueue::SSubmitInfo::SCommandBufferInfo cmds[] = { {.cmdbuf = initCmdbuf.get()} };
-			video::IQueue::SSubmitInfo submit = {};
-			submit.commandBuffers = cmds;
-			queue->submit({&submit, 1u});
-			m_device->waitIdle();
-		}
-
-		// Descriptor set: bind both buffers
-		auto pool = m_device->createDescriptorPoolForDSLayouts(video::IDescriptorPool::ECF_NONE, { &dsLayout.get(), 1 });
-		m_ds = pool->createDescriptorSet(core::smart_refctd_ptr(dsLayout));
-		{
-			video::IGPUDescriptorSet::SDescriptorInfo info[2];
-			info[0].desc = core::smart_refctd_ptr(inputBuf);
-			info[0].info.buffer = { .offset = 0, .size = data.inputBufferBytes };
-			info[1].desc = core::smart_refctd_ptr(outputBuf);
-			info[1].info.buffer = { .offset = 0, .size = data.outputBufferBytes };
-			video::IGPUDescriptorSet::SWriteDescriptorSet writes[2] = {
-				{ .dstSet = m_ds.get(), .binding = 0, .arrayElement = 0, .count = 1, .info = &info[0] },
-				{ .dstSet = m_ds.get(), .binding = 1, .arrayElement = 0, .count = 1, .info = &info[1] }
-			};
-			m_device->updateDescriptorSets(writes, {});
-		}
-
-		m_queue = m_device->getQueue(data.computeFamilyIndex, 0);
-		m_samplesPerDispatch = data.samplesPerDispatch;
-		m_physicalDevice = data.physicalDevice;
-	}
-
-	void logPipelineReport(const std::string& name) const
+   public:
+   struct SetupData : GPUBenchmark::SetupData
    {
-		if (!m_executableReport.empty())
-			m_logger->log("%s Sampler Benchmark Pipeline Executable Report:\n%s", ILogger::ELL_PERFORMANCE, name.c_str(), m_executableReport.c_str());
-	}
-
-	void run(const std::string& samplerName, const std::string& mode, uint32_t warmupIterations = 500, uint32_t benchmarkIterations = 5000)
-	{
-		m_device->waitIdle();
-
-		const uint32_t cooldownIterations = warmupIterations;
-
-		m_benchmarkCmdbuf->reset(video::IGPUCommandBuffer::RESET_FLAGS::NONE);
-		m_benchmarkCmdbuf->begin(video::IGPUCommandBuffer::USAGE::ONE_TIME_SUBMIT_BIT);
-		m_benchmarkCmdbuf->resetQueryPool(m_queryPool.get(), 0, 2);
-		m_benchmarkCmdbuf->bindComputePipeline(m_pipeline.get());
-		m_benchmarkCmdbuf->bindDescriptorSets(asset::EPBP_COMPUTE, m_pplnLayout.get(), 0, 1, &m_ds.get());
-		{
-			SamplerBenchPushConstants pc = { .outputAddress = m_outputAddress };
-			m_benchmarkCmdbuf->pushConstants(m_pplnLayout.get(), asset::IShader::E_SHADER_STAGE::ESS_COMPUTE, 0, sizeof(pc), &pc);
-		}
-		for (uint32_t i = 0u; i < warmupIterations; ++i)
-			m_benchmarkCmdbuf->dispatch(m_dispatchGroupCount, 1, 1);
-		m_benchmarkCmdbuf->writeTimestamp(asset::PIPELINE_STAGE_FLAGS::COMPUTE_SHADER_BIT, m_queryPool.get(), 0);
-		for (uint32_t i = 0u; i < benchmarkIterations; ++i)
-			m_benchmarkCmdbuf->dispatch(m_dispatchGroupCount, 1, 1);
-		m_benchmarkCmdbuf->writeTimestamp(asset::PIPELINE_STAGE_FLAGS::COMPUTE_SHADER_BIT, m_queryPool.get(), 1);
-		for (uint32_t i = 0u; i < cooldownIterations; ++i)
-			m_benchmarkCmdbuf->dispatch(m_dispatchGroupCount, 1, 1);
-		m_benchmarkCmdbuf->end();
+      core::smart_refctd_ptr<asset::IAssetManager> assetMgr;
+      GPUBenchmarkHelper::ShaderVariant            variant; // precompiled key OR source path + defines
+      size_t                                       outputBufferBytes; // sizeof(uint32_t) * threadsPerDispatch
+   };
 
-		auto semaphore = m_device->createSemaphore(0u);
-		const video::IQueue::SSubmitInfo::SCommandBufferInfo benchCmds[] = { {.cmdbuf = m_benchmarkCmdbuf.get()} };
-		const video::IQueue::SSubmitInfo::SSemaphoreInfo signalSem[] = {
-			{.semaphore = semaphore.get(), .value = 1u, .stageMask = asset::PIPELINE_STAGE_FLAGS::COMPUTE_SHADER_BIT}
-		};
-		video::IQueue::SSubmitInfo submit = {};
-		submit.commandBuffers = benchCmds;
-		submit.signalSemaphores = signalSem;
-		m_queue->submit({&submit, 1u});
-
-		m_device->waitIdle();
-
-		uint64_t timestamps[2] = {};
-		const auto flags = core::bitflag(video::IQueryPool::RESULTS_FLAGS::_64_BIT) |
-		                   core::bitflag(video::IQueryPool::RESULTS_FLAGS::WAIT_BIT);
-		m_device->getQueryPoolResults(m_queryPool.get(), 0, 2, timestamps, sizeof(uint64_t), flags);
-
-		const float64_t timestampPeriod = float64_t(m_physicalDevice->getLimits().timestampPeriodInNanoSeconds);
-		const float64_t elapsed_ns      = float64_t(timestamps[1] - timestamps[0]) * timestampPeriod;
-		const uint64_t total_samples    = uint64_t(benchmarkIterations) * uint64_t(m_samplesPerDispatch);
-		const float64_t ps_per_sample   = elapsed_ns * 1e3 / float64_t(total_samples);
-		const float64_t gsamples_per_s  = float64_t(total_samples) / elapsed_ns;
-		const float64_t elapsed_ms      = elapsed_ns * 1e-6;
+   CSamplerBenchmark(Aggregator& aggregator, const SetupData& data)
+      : GPUBenchmark(aggregator, data) // slicing-copy of the GPUBenchmark::SetupData base
+   {
+      auto bda        = createBdaOutputBuffer(data.outputBufferBytes);
+      m_outputBuf     = std::move(bda.buf);
+      m_outputAddress = bda.address;
 
-		m_logger->log("[Benchmark] %-28s | %-38s | %12.3f | %12.3f | %12.3f",
-			system::ILogger::ELL_PERFORMANCE,
-			samplerName.c_str(), mode.c_str(), ps_per_sample, gsamples_per_s, elapsed_ms);
-	}
+      m_pipelineIdx = createPipeline(data.variant, data.assetMgr, sizeof(SamplerBenchPushConstants), joinName(data.name));
+   }
 
-private:
-	core::smart_refctd_ptr<video::ILogicalDevice>       m_device;
-	core::smart_refctd_ptr<system::ILogger>             m_logger;
-	core::smart_refctd_ptr<video::IGPUCommandPool>      m_cmdpool;
-	core::smart_refctd_ptr<video::IGPUCommandBuffer>    m_benchmarkCmdbuf;
-	core::smart_refctd_ptr<video::IQueryPool>           m_queryPool;
-	core::smart_refctd_ptr<video::IGPUPipelineLayout>   m_pplnLayout;
-	core::smart_refctd_ptr<video::IGPUComputePipeline>  m_pipeline;
-	core::smart_refctd_ptr<video::IGPUDescriptorSet>    m_ds;
-	video::IDeviceMemoryAllocator::SAllocation          m_inputAlloc  = {};
-	video::IDeviceMemoryAllocator::SAllocation          m_outputAlloc = {};
-	uint64_t                                            m_outputAddress      = 0;
-	video::IQueue*                                      m_queue              = nullptr;
-	video::IPhysicalDevice*                             m_physicalDevice     = nullptr;
-	uint32_t                                            m_dispatchGroupCount = 0;
-	uint32_t                                            m_samplesPerDispatch = 0;
-	std::string                                         m_executableReport;
+   void doRun() override
+   {
+      const PipelineEntry&      pe = m_pipelines[m_pipelineIdx];
+      SamplerBenchPushConstants pc = {};
+      pc.outputAddress             = m_outputAddress;
+
+      const TimingResult t = runTimedBudgeted(getWarmupDispatches(), getTargetBudgetMs(),
+         [&](video::IGPUCommandBuffer* cb) { defaultBindAndPush(cb, pe, pc); },
+         [this](video::IGPUCommandBuffer* cb) { defaultDispatch(cb); },
+         samplesForCurrentRow());
+
+      record(m_name, t, pe.stats);
+   }
+
+   private:
+   core::smart_refctd_ptr<video::IGPUBuffer> m_outputBuf;
+   uint64_t                                  m_outputAddress = 0;
+   uint32_t                                  m_pipelineIdx   = 0;
 };
 
 #endif
diff --git a/37_HLSLSamplingTests/main.cpp b/37_HLSLSamplingTests/main.cpp
index e0248d034..634e84123 100644
--- a/37_HLSLSamplingTests/main.cpp
+++ b/37_HLSLSamplingTests/main.cpp
@@ -1,5 +1,6 @@
 #include <nabla.h>
 
+#include <chrono>
 #include <utility>
 
 #include "nbl/examples/examples.hpp"
@@ -191,149 +192,159 @@ class HLSLSamplingTests final : public application_templates::MonoDeviceApplicat
       // ======================================================================
       // GPU throughput benchmarks
       // ======================================================================
-      constexpr uint32_t testBatchCount = 4096;
+      constexpr uint32_t benchWorkgroupsCount = 4096;
       constexpr bool     DoBenchmark    = true;
 
       if constexpr (DoBenchmark)
       {
          constexpr uint32_t benchWorkgroupSize      = WORKGROUP_SIZE;
-         constexpr uint32_t totalThreadsPerDispatch = testBatchCount * benchWorkgroupSize;
+         constexpr uint32_t totalThreadsPerDispatch = benchWorkgroupsCount * benchWorkgroupSize;
          constexpr uint32_t iterationsPerThread     = BENCH_ITERS;
          constexpr uint32_t benchSamplesPerDispatch = totalThreadsPerDispatch * iterationsPerThread;
+         constexpr uint32_t warmupDispatches        = 300;          // unmeasured warmup + cooldown around the timing window
+         constexpr uint64_t targetBudgetMs          = 400;          // wall-clock per row; runTimedBudgeted sizes dispatches
+
+         std::vector<CSamplerBenchmark> benchmarks;
+
+         // Single Aggregator owns results, baselines, formatting, and reporting
+         // for both bench classes. Passed by reference into each bench's ctor.
+         Aggregator agg(m_logger, m_device, m_physicalDevice, getComputeQueue()->getFamilyIndex());
+         const auto cli = agg.applyCli({
+            .argv              = this->argv,
+            .defaultOutputPath = "SamplerBench.json",
+            .appName           = "37_HLSLSamplingTests",
+         });
+
+         // One context for the whole sampler-bench span: drives both the per-bench
+         // shape/budget and the banner that runSessionAndReport prints.
+         const RunContext samplerCtx = {
+            .shape          = {
+                       .workgroupSize      = {benchWorkgroupSize, 1u, 1u},
+                       .dispatchGroupCount = {benchWorkgroupsCount, 1u, 1u},
+                       .samplesPerDispatch = benchSamplesPerDispatch,
+            },
+            .targetBudgetMs = targetBudgetMs,
+            .sectionLabel   = "GPU Sampler Benchmarks",
+         };
 
-         struct BenchEntry
+         auto addBench = [&](const std::initializer_list<std::string> name, GPUBenchmarkHelper::ShaderVariant variant, size_t outputSize)
          {
-            CSamplerBenchmark bench;
-            std::string       sampler;
-            std::string       mode;
+            CSamplerBenchmark::SetupData data;
+            data.assetMgr          = m_assetMgr;
+            data.name              = name;
+            data.variant           = std::move(variant);
+            data.outputBufferBytes = outputSize;
+            data.warmupDispatches  = warmupDispatches;
+            data.shape             = samplerCtx.shape;
+            data.targetBudgetMs    = samplerCtx.targetBudgetMs;
+
+            benchmarks.emplace_back(agg, data);
          };
-         std::vector<BenchEntry> benchmarks;
 
-         auto addBench = [&](const char* sampler, const char* mode, const std::string& shaderKey, size_t inputSize, size_t outputSize)
+         // Convenience wrappers so the 35+ existing precompiled-key calls below stay
+         // one line each, and adding a new runtime variant is also a one-liner without
+         // CMake JSON edits. Both go through the same addBench, just construct the
+         // ShaderVariant differently.
+         auto addPrecompiled = [&]<nbl::core::StringLiteral ShaderKey>(std::initializer_list<std::string> name, size_t outputSize)
          {
-            auto& entry   = benchmarks.emplace_back();
-            entry.sampler = sampler;
-            entry.mode    = mode;
-
-            CSamplerBenchmark::SetupData data;
-            data.device             = m_device;
-            data.api                = m_api;
-            data.assetMgr           = m_assetMgr;
-            data.logger             = m_logger;
-            data.physicalDevice     = m_physicalDevice;
-            data.computeFamilyIndex = getComputeQueue()->getFamilyIndex();
-            data.shaderKey          = shaderKey;
-            data.dispatchGroupCount = testBatchCount;
-            data.samplesPerDispatch = benchSamplesPerDispatch;
-            data.inputBufferBytes   = inputSize;
-            data.outputBufferBytes  = outputSize;
-            entry.bench.setup(data);
+            auto shader = nbl::this_example::builtin::build::get_spirv_key<ShaderKey>(m_device.get());
+            addBench(name, GPUBenchmarkHelper::ShaderVariant::Precompiled(std::move(shader)), outputSize);
+         };
+         auto addRuntime = [&](std::initializer_list<std::string> name, const char* sourcePath, std::vector<GPUBenchmarkHelper::ShaderVariant::Define> defines, size_t outputSize)
+         {
+            // Mirror CMake's COMMON_OPTIONS so runtime variants see the same baseline
+            // as precompiled ones.
+            std::vector<GPUBenchmarkHelper::ShaderVariant::Define> all = {
+               {"WORKGROUP_SIZE", std::to_string(WORKGROUP_SIZE)},
+               {"BENCH_ITERS", std::to_string(BENCH_ITERS)},
+            };
+            all.insert(all.end(), std::make_move_iterator(defines.begin()), std::make_move_iterator(defines.end()));
+            addBench(name, GPUBenchmarkHelper::ShaderVariant::FromSource(sourcePath, std::move(all)), outputSize);
          };
 
-         // Bench shaders don't read input (hardcoded values) and write a single uint32_t per thread via RWByteAddressBuffer
+         // Bench shaders don't read input -- output is BDA via push constants.
          if constexpr (true)
          {
-            constexpr size_t benchInputBytes  = sizeof(uint32_t); // unused but binding must exist, didn't bother removing because some samplers need more complex inputs and it's easier to have a consistent buffer setup for all benchmarks
             constexpr size_t benchOutputBytes = sizeof(uint32_t) * totalThreadsPerDispatch;
-            addBench("Linear", "1:1", nbl::this_example::builtin::build::get_spirv_key<"linear_bench_1_1">(m_device.get()), benchInputBytes, benchOutputBytes);
-            addBench("Linear", "1:16", nbl::this_example::builtin::build::get_spirv_key<"linear_bench_1_16">(m_device.get()), benchInputBytes, benchOutputBytes);
-            addBench("Bilinear", "1:1", nbl::this_example::builtin::build::get_spirv_key<"bilinear_bench_1_1">(m_device.get()), benchInputBytes, benchOutputBytes);
-            addBench("Bilinear", "1:16", nbl::this_example::builtin::build::get_spirv_key<"bilinear_bench_1_16">(m_device.get()), benchInputBytes, benchOutputBytes);
-            addBench("BoxMullerTransform", "1:1", nbl::this_example::builtin::build::get_spirv_key<"box_muller_transform_bench_1_1">(m_device.get()), benchInputBytes, benchOutputBytes);
-            addBench("BoxMullerTransform", "1:16", nbl::this_example::builtin::build::get_spirv_key<"box_muller_transform_bench_1_16">(m_device.get()), benchInputBytes, benchOutputBytes);
-            addBench("UniformHemisphere", "1:1", nbl::this_example::builtin::build::get_spirv_key<"uniform_hemisphere_bench_1_1">(m_device.get()), benchInputBytes, benchOutputBytes);
-            addBench("UniformHemisphere", "1:16", nbl::this_example::builtin::build::get_spirv_key<"uniform_hemisphere_bench_1_16">(m_device.get()), benchInputBytes, benchOutputBytes);
-            addBench("UniformSphere", "1:1", nbl::this_example::builtin::build::get_spirv_key<"uniform_sphere_bench_1_1">(m_device.get()), benchInputBytes, benchOutputBytes);
-            addBench("UniformSphere", "1:16", nbl::this_example::builtin::build::get_spirv_key<"uniform_sphere_bench_1_16">(m_device.get()), benchInputBytes, benchOutputBytes);
-            addBench("ConcentricMapping", "1:1", nbl::this_example::builtin::build::get_spirv_key<"concentric_mapping_bench_1_1">(m_device.get()), benchInputBytes, benchOutputBytes);
-            addBench("ConcentricMapping", "1:16", nbl::this_example::builtin::build::get_spirv_key<"concentric_mapping_bench_1_16">(m_device.get()), benchInputBytes, benchOutputBytes);
-            addBench("PolarMapping", "1:1", nbl::this_example::builtin::build::get_spirv_key<"polar_mapping_bench_1_1">(m_device.get()), benchInputBytes, benchOutputBytes);
-            addBench("PolarMapping", "1:16", nbl::this_example::builtin::build::get_spirv_key<"polar_mapping_bench_1_16">(m_device.get()), benchInputBytes, benchOutputBytes);
-            addBench("ProjectedHemisphere", "1:1", nbl::this_example::builtin::build::get_spirv_key<"projected_hemisphere_bench_1_1">(m_device.get()), benchInputBytes, benchOutputBytes);
-            addBench("ProjectedHemisphere", "1:16", nbl::this_example::builtin::build::get_spirv_key<"projected_hemisphere_bench_1_16">(m_device.get()), benchInputBytes, benchOutputBytes);
-            addBench("ProjectedSphere", "1:1", nbl::this_example::builtin::build::get_spirv_key<"projected_sphere_bench_1_1">(m_device.get()), benchInputBytes, benchOutputBytes);
-            addBench("ProjectedSphere", "1:16", nbl::this_example::builtin::build::get_spirv_key<"projected_sphere_bench_1_16">(m_device.get()), benchInputBytes, benchOutputBytes);
-            addBench("SphericalRectangle", "1:1  (shape,observer)", nbl::this_example::builtin::build::get_spirv_key<"spherical_rectangle_bench_1_1_shape_observer">(m_device.get()), benchInputBytes, benchOutputBytes);
-            addBench("SphericalRectangle", "1:16 (shape,observer)", nbl::this_example::builtin::build::get_spirv_key<"spherical_rectangle_bench_1_16_shape_observer">(m_device.get()), benchInputBytes, benchOutputBytes);
-            addBench("SphericalRectangle", "1:1  (sa,extents)", nbl::this_example::builtin::build::get_spirv_key<"spherical_rectangle_bench_1_1_sa_extents">(m_device.get()), benchInputBytes, benchOutputBytes);
-            addBench("SphericalRectangle", "1:16 (sa,extents)", nbl::this_example::builtin::build::get_spirv_key<"spherical_rectangle_bench_1_16_sa_extents">(m_device.get()), benchInputBytes, benchOutputBytes);
-            addBench("SphericalRectangle", "1:1  (r0,extents)", nbl::this_example::builtin::build::get_spirv_key<"spherical_rectangle_bench_1_1_r0_extents">(m_device.get()), benchInputBytes, benchOutputBytes);
-            addBench("SphericalRectangle", "1:16 (r0,extents)", nbl::this_example::builtin::build::get_spirv_key<"spherical_rectangle_bench_1_16_r0_extents">(m_device.get()), benchInputBytes, benchOutputBytes);
-            addBench("SphericalRectangle", "create-only (shape,observer)", nbl::this_example::builtin::build::get_spirv_key<"spherical_rectangle_bench_create_only_shape_observer">(m_device.get()), benchInputBytes, benchOutputBytes);
-            addBench("SphericalRectangle", "create-only (sa,extents)", nbl::this_example::builtin::build::get_spirv_key<"spherical_rectangle_bench_create_only_sa_extents">(m_device.get()), benchInputBytes, benchOutputBytes);
-            addBench("SphericalRectangle", "create-only (r0,extents)", nbl::this_example::builtin::build::get_spirv_key<"spherical_rectangle_bench_create_only_r0_extents">(m_device.get()), benchInputBytes, benchOutputBytes);
-            addBench("ProjectedSphericalRectangle", "1:1", nbl::this_example::builtin::build::get_spirv_key<"projected_spherical_rectangle_bench_1_1">(m_device.get()), benchInputBytes, benchOutputBytes);
-            addBench("ProjectedSphericalRectangle", "1:16", nbl::this_example::builtin::build::get_spirv_key<"projected_spherical_rectangle_bench_1_16">(m_device.get()), benchInputBytes, benchOutputBytes);
-            addBench("ProjectedSphericalRectangle", "create-only", nbl::this_example::builtin::build::get_spirv_key<"projected_spherical_rectangle_bench_create_only">(m_device.get()), benchInputBytes, benchOutputBytes);
-            addBench("SphericalTriangle", "1:1", nbl::this_example::builtin::build::get_spirv_key<"spherical_triangle_bench_1_1">(m_device.get()), benchInputBytes, benchOutputBytes);
-            addBench("SphericalTriangle", "1:16", nbl::this_example::builtin::build::get_spirv_key<"spherical_triangle_bench_1_16">(m_device.get()), benchInputBytes, benchOutputBytes);
-            addBench("SphericalTriangle", "create-only", nbl::this_example::builtin::build::get_spirv_key<"spherical_triangle_bench_create_only">(m_device.get()), benchInputBytes, benchOutputBytes);
-            addBench("ProjectedSphericalTriangle", "1:1", nbl::this_example::builtin::build::get_spirv_key<"projected_spherical_triangle_bench_1_1">(m_device.get()), benchInputBytes, benchOutputBytes);
-            addBench("ProjectedSphericalTriangle", "1:16", nbl::this_example::builtin::build::get_spirv_key<"projected_spherical_triangle_bench_1_16">(m_device.get()), benchInputBytes, benchOutputBytes);
-            addBench("ProjectedSphericalTriangle", "create-only", nbl::this_example::builtin::build::get_spirv_key<"projected_spherical_triangle_bench_create_only">(m_device.get()), benchInputBytes, benchOutputBytes);
+            addPrecompiled.operator()<"linear_bench_1_1">({"Linear", "Linear", "1:1"}, benchOutputBytes);
+            addPrecompiled.operator()<"linear_bench_1_16">({"Linear", "Linear", "1:16"}, benchOutputBytes);
+            addPrecompiled.operator()<"bilinear_bench_1_1">({"Linear", "Bilinear", "1:1"}, benchOutputBytes);
+            addPrecompiled.operator()<"bilinear_bench_1_16">({"Linear", "Bilinear", "1:16"}, benchOutputBytes);
+            addPrecompiled.operator()<"box_muller_transform_bench_1_1">({"Gaussian", "BoxMullerTransform", "1:1"}, benchOutputBytes);
+            addPrecompiled.operator()<"box_muller_transform_bench_1_16">({"Gaussian", "BoxMullerTransform", "1:16"}, benchOutputBytes);
+            addPrecompiled.operator()<"uniform_hemisphere_bench_1_1">({"SphereSampling", "UniformHemisphere", "1:1"}, benchOutputBytes);
+            addPrecompiled.operator()<"uniform_hemisphere_bench_1_16">({"SphereSampling", "UniformHemisphere", "1:16"}, benchOutputBytes);
+            addPrecompiled.operator()<"uniform_sphere_bench_1_1">({"SphereSampling", "UniformSphere", "1:1"}, benchOutputBytes);
+            addPrecompiled.operator()<"uniform_sphere_bench_1_16">({"SphereSampling", "UniformSphere", "1:16"}, benchOutputBytes);
+            addPrecompiled.operator()<"projected_hemisphere_bench_1_1">({"SphereSampling", "ProjectedHemisphere", "1:1"}, benchOutputBytes);
+            addPrecompiled.operator()<"projected_hemisphere_bench_1_16">({"SphereSampling", "ProjectedHemisphere", "1:16"}, benchOutputBytes);
+            addPrecompiled.operator()<"projected_sphere_bench_1_1">({"SphereSampling", "ProjectedSphere", "1:1"}, benchOutputBytes);
+            addPrecompiled.operator()<"projected_sphere_bench_1_16">({"SphereSampling", "ProjectedSphere", "1:16"}, benchOutputBytes);
+            addPrecompiled.operator()<"concentric_mapping_bench_1_1">({"DiskMappers", "ConcentricMapping", "1:1"}, benchOutputBytes);
+            addPrecompiled.operator()<"concentric_mapping_bench_1_16">({"DiskMappers", "ConcentricMapping", "1:16"}, benchOutputBytes);
+            addPrecompiled.operator()<"polar_mapping_bench_1_1">({"DiskMappers", "PolarMapping", "1:1"}, benchOutputBytes);
+            addPrecompiled.operator()<"polar_mapping_bench_1_16">({"DiskMappers", "PolarMapping", "1:16"}, benchOutputBytes);
+            addPrecompiled.operator()<"spherical_rectangle_bench_1_1_shape_observer">({"SphShapes", "SphRect", "1:1", "shape,observer"}, benchOutputBytes);
+            addPrecompiled.operator()<"spherical_rectangle_bench_1_1_sa_extents">({"SphShapes", "SphRect", "1:1", "sa,extents"}, benchOutputBytes);
+            addPrecompiled.operator()<"spherical_rectangle_bench_1_1_r0_extents">({"SphShapes", "SphRect", "1:1", "r0,extents"}, benchOutputBytes);
+            addPrecompiled.operator()<"spherical_rectangle_bench_1_16_shape_observer">({"SphShapes", "SphRect", "1:16", "shape,observer"}, benchOutputBytes);
+            addPrecompiled.operator()<"spherical_rectangle_bench_1_16_sa_extents">({"SphShapes", "SphRect", "1:16", "sa,extents"}, benchOutputBytes);
+            addPrecompiled.operator()<"spherical_rectangle_bench_1_16_r0_extents">({"SphShapes", "SphRect", "1:16", "r0,extents"}, benchOutputBytes);
+            addPrecompiled.operator()<"spherical_rectangle_bench_create_only_shape_observer">({"SphShapes", "SphRect", "create-only", "shape,observer"}, benchOutputBytes);
+            addPrecompiled.operator()<"spherical_rectangle_bench_create_only_sa_extents">({"SphShapes", "SphRect", "create-only", "sa,extents"}, benchOutputBytes);
+            addPrecompiled.operator()<"spherical_rectangle_bench_create_only_r0_extents">({"SphShapes", "SphRect", "create-only", "r0,extents"}, benchOutputBytes);
+            addPrecompiled.operator()<"projected_spherical_rectangle_bench_1_1">({"SphShapes", "ProjSphRect", "1:1"}, benchOutputBytes);
+            addPrecompiled.operator()<"projected_spherical_rectangle_bench_1_16">({"SphShapes", "ProjSphRect", "1:16"}, benchOutputBytes);
+            addPrecompiled.operator()<"projected_spherical_rectangle_bench_create_only">({"SphShapes", "ProjSphRect", "create-only"}, benchOutputBytes);
+            addPrecompiled.operator()<"spherical_triangle_bench_1_1">({"SphShapes", "SphTri", "1:1"}, benchOutputBytes);
+            addPrecompiled.operator()<"spherical_triangle_bench_1_16">({"SphShapes", "SphTri", "1:16"}, benchOutputBytes);
+            addPrecompiled.operator()<"spherical_triangle_bench_create_only">({"SphShapes", "SphTri", "create-only"}, benchOutputBytes);
+            addPrecompiled.operator()<"projected_spherical_triangle_bench_1_1">({"SphShapes", "ProjSphTri", "1:1"}, benchOutputBytes);
+            addPrecompiled.operator()<"projected_spherical_triangle_bench_1_16">({"SphShapes", "ProjSphTri", "1:16"}, benchOutputBytes);
+            addPrecompiled.operator()<"projected_spherical_triangle_bench_create_only">({"SphShapes", "ProjSphTri", "create-only"}, benchOutputBytes);
+            // ---- Runtime-compiled demo variants (no CMake JSON edit needed) ----
+            // Same .hlsl source as the precompiled "linear_bench_1_*" entries, but with
+            // a `BENCH_SAMPLES_PER_CREATE` value that has no JSON entry. Add as many
+            // here as you want -- each is a one-liner, no reconfigure required.
+            //addRuntime({"Linear", "Linear", "1:4 (rt)"}, "shaders/linear_test.comp.hlsl", {{"BENCH_SAMPLES_PER_CREATE", "4"}}, benchOutputBytes);
+            //addRuntime({"Linear", "Linear", "1:8 (rt)"}, "shaders/linear_test.comp.hlsl", {{"BENCH_SAMPLES_PER_CREATE", "8"}}, benchOutputBytes);
          }
 
-         // Print all pipeline reports first
-         for (auto& entry : benchmarks)
-            entry.bench.logPipelineReport(entry.sampler + " (" + entry.mode + ")");
-
          // Discrete sampler benchmark: alias table vs cumulative probability (BDA)
          {
             CDiscreteSamplerBenchmark::SetupData dsData;
-            dsData.device                    = m_device;
-            dsData.api                       = m_api;
-            dsData.assetMgr                  = m_assetMgr;
-            dsData.logger                    = m_logger;
-            dsData.physicalDevice            = m_physicalDevice;
-            dsData.computeFamilyIndex        = getComputeQueue()->getFamilyIndex();
-            dsData.packedAliasAShaderKey     = nbl::this_example::builtin::build::get_spirv_key<"packed_alias_a_bench">(m_device.get());
-            dsData.packedAliasBShaderKey     = nbl::this_example::builtin::build::get_spirv_key<"packed_alias_b_bench">(m_device.get());
-            dsData.cumProbShaderKey          = nbl::this_example::builtin::build::get_spirv_key<"cumulative_probability_bench">(m_device.get());
-            dsData.cumProbYoloShaderKey      = nbl::this_example::builtin::build::get_spirv_key<"cumulative_probability_yolo_bench">(m_device.get());
-            dsData.cumProbEytzingerShaderKey = nbl::this_example::builtin::build::get_spirv_key<"cumulative_probability_eytzinger_bench">(m_device.get());
-            dsData.dispatchGroupCount        = testBatchCount;
-
-            CDiscreteSamplerBenchmark discreteBench;
-            discreteBench.setup(dsData);
-
-            // Then run all benchmarks here so the reports are at the top of the log, followed by timings
-            {
-               constexpr uint32_t warmupDispatches = 300;
-               constexpr uint32_t benchDispatches  = 1000;
-               m_logger->log("=== GPU Sampler Benchmarks (%u dispatches, %u threads/dispatch, %u iters/thread, ps/sample is per all GPU threads) ===",
-                  ILogger::ELL_PERFORMANCE, benchDispatches, totalThreadsPerDispatch, iterationsPerThread);
-               m_logger->log("            %-28s | %-38s | %12s | %12s | %12s",
-                  ILogger::ELL_PERFORMANCE, "Sampler", "Mode", "ps/sample", "GSamples/s", "ms total");
-               for (auto& entry : benchmarks)
-                  entry.bench.run(entry.sampler, entry.mode, warmupDispatches, benchDispatches);
-            }
-
-            {
-               // If you change something here, better change kBenchTable below too
-               const std::vector<uint32_t> discreteSizes = {
-                  2u, 4u, 8u, 16u, 32u, 64u, 100u, 128u, 256u, 400u, 512u, 1024u, 2048u, 2049u, 3000u, 4096u, 7000u, 8192u, 10'000u, 16'384u, 32'768u,
-                  65'536u, 131'072u, 262'144u, 524'288u, 1'000'000u, 1'048'576u, 2'097'152u, 16'777'216u, 20'971'520u, 25'165'824u, 33'554'432u};
-
-               // Per-N dispatch counts calibrated from a prior measured run
-               auto dispatchScheduler = [](uint32_t N) -> CDiscreteSamplerBenchmark::DispatchCounts
-               {
-                  static constexpr std::pair<uint32_t, uint32_t> kBenchTable[] = {
-                     {2u, 7180u}, {4u, 5993u}, {8u, 4490u}, {16u, 4099u}, {32u, 3110u}, {64u, 3026u}, {100u, 2507u}, {128u, 2498u}, {256u, 2477u}, {400u, 2001u},
-                     {512u, 1827u}, {1024u, 1372u}, {2048u, 1010u}, {2049u, 1010u}, {3000u, 859u}, {4096u, 962u}, {7000u, 742u}, {8192u, 833u}, {10'000u, 590u}, {16'384u, 786u}, {32'768u, 608u},
-                     {65'536u, 283u}, {131'072u, 174u}, {262'144u, 160u}, {524'288u, 133u}, {1'000'000u, 77u}, {1'048'576u, 128u}, {2'097'152u, 106u}, {16'777'216u, 17u}, {20'971'520u, 17u}, {25'165'824u, 16u}, {33'554'432u, 14u}};
-                  uint32_t bench = 10u; // fallback for any N not in the table
-                  for (const auto& e : kBenchTable)
-                     if (e.first == N)
-                     {
-                        bench = e.second;
-                        break;
-                     }
-                  const uint32_t warmup = std::max(5u, bench / 10u);
-                  return {warmup, bench};
-               };
-
-               discreteBench.runSweep(discreteSizes, dispatchScheduler);
-            }
+            dsData.assetMgr                = m_assetMgr;
+            dsData.packedAliasAVariant     = GPUBenchmarkHelper::ShaderVariant::Precompiled(nbl::this_example::builtin::build::get_spirv_key<"packed_alias_a_bench">(m_device.get()));
+            dsData.packedAliasBVariant     = GPUBenchmarkHelper::ShaderVariant::Precompiled(nbl::this_example::builtin::build::get_spirv_key<"packed_alias_b_bench">(m_device.get()));
+            dsData.cumProbVariant          = GPUBenchmarkHelper::ShaderVariant::Precompiled(nbl::this_example::builtin::build::get_spirv_key<"cumulative_probability_bench">(m_device.get()));
+            dsData.cumProbYoloVariant      = GPUBenchmarkHelper::ShaderVariant::Precompiled(nbl::this_example::builtin::build::get_spirv_key<"cumulative_probability_yolo_bench">(m_device.get()));
+            dsData.cumProbEytzingerVariant = GPUBenchmarkHelper::ShaderVariant::Precompiled(nbl::this_example::builtin::build::get_spirv_key<"cumulative_probability_eytzinger_bench">(m_device.get()));
+            dsData.dispatchGroupCount      = {benchWorkgroupsCount, 1u, 1u};
+            dsData.targetBudgetMs          = targetBudgetMs;
+
+            // Just the N values now -- runTimedBudgeted sizes dispatches per
+            // row to hit the budget. The old per-N tuning table is gone.
+            static constexpr uint32_t kSweepNs[] = {
+               2u, 4u, 8u, 16u, 32u, 64u, 100u, 128u, 256u, 400u,
+               512u, 1024u, 2048u, 2049u, 3000u, 4096u, 7000u, 8192u, 10'000u, 16'384u, 32'768u,
+               65'536u, 131'072u, 262'144u, 524'288u, 1'000'000u, 1'048'576u, 2'097'152u, 16'777'216u, 20'971'520u, 25'165'824u, 33'554'432u};
+            dsData.sweepNs                 = kSweepNs;
+
+            CDiscreteSamplerBenchmark discreteBench(agg, dsData);
+
+            const RunContext discreteCtx = {
+               .shape          = CDiscreteSamplerBenchmark::shapeFor(dsData),
+               .targetBudgetMs = targetBudgetMs,
+               .sectionLabel   = "Discrete Sampler Sweep",
+            };
+
+            // Single call. Each span contributes its own focus rows first, then
+            // every span's unfocused rows -- the aggregator iterates both packs
+            // in each phase. CDiscrete's overridden run() does per-row filtering
+            // against cli.focusVariants since its rows aren't a flat list.
+            agg.runSessionAndReport(
+               Aggregator::makeSpan(benchmarks,    samplerCtx),
+               Aggregator::makeSpan(discreteBench, discreteCtx));
          }
       }
 
@@ -341,9 +352,9 @@ class HLSLSamplingTests final : public application_templates::MonoDeviceApplicat
       // Runtime CPU/GPU comparison tests using ITester harness
       // ================================================================
       bool pass = true;
-
+      constexpr uint32_t testWorkgroupsCount = 4096;
       // generic lambda to run a GPU sampler test
-      auto runSamplerTest = [&]<typename Tester>(const char* testName, auto spirvKey, const char* logFile)
+      auto runSamplerTest = [&]<typename Tester, core::StringLiteral ShaderKey>(const char* testName, const char* logFile)
       {
          m_logger->log("Running %s tests...", ILogger::ELL_INFO, testName);
          typename Tester::PipelineSetupData data;
@@ -353,8 +364,8 @@ class HLSLSamplingTests final : public application_templates::MonoDeviceApplicat
          data.logger             = m_logger;
          data.physicalDevice     = m_physicalDevice;
          data.computeFamilyIndex = getComputeQueue()->getFamilyIndex();
-         data.shaderKey          = std::move(spirvKey);
-         Tester tester(testBatchCount);
+         data.shaderKey          = std::move(nbl::this_example::builtin::build::get_spirv_key<ShaderKey>(m_device.get()));
+         Tester tester(testWorkgroupsCount);
          tester.setupPipeline(data);
          pass &= tester.performTestsAndVerifyResults(logFile);
       };
@@ -362,22 +373,22 @@ class HLSLSamplingTests final : public application_templates::MonoDeviceApplicat
       // --- Sampler tests ---
       if constexpr (true)
       {
-         runSamplerTest.operator()<CLinearTester>("Linear sampler", nbl::this_example::builtin::build::get_spirv_key<"linear_test">(m_device.get()), "LinearTestLog.txt");
-         runSamplerTest.operator()<CBilinearTester>("Bilinear sampler", nbl::this_example::builtin::build::get_spirv_key<"bilinear_test">(m_device.get()), "BilinearTestLog.txt");
-         runSamplerTest.operator()<CUniformHemisphereTester>("UniformHemisphere sampler", nbl::this_example::builtin::build::get_spirv_key<"uniform_hemisphere_test">(m_device.get()), "UniformHemisphereTestLog.txt");
-         runSamplerTest.operator()<CUniformSphereTester>("UniformSphere sampler", nbl::this_example::builtin::build::get_spirv_key<"uniform_sphere_test">(m_device.get()), "UniformSphereTestLog.txt");
-         runSamplerTest.operator()<CProjectedHemisphereTester>("ProjectedHemisphere sampler", nbl::this_example::builtin::build::get_spirv_key<"projected_hemisphere_test">(m_device.get()), "ProjectedHemisphereTestLog.txt");
-         runSamplerTest.operator()<CProjectedSphereTester>("ProjectedSphere sampler", nbl::this_example::builtin::build::get_spirv_key<"projected_sphere_test">(m_device.get()), "ProjectedSphereTestLog.txt");
-         runSamplerTest.operator()<CConcentricMappingTester>("ConcentricMapping sampler", nbl::this_example::builtin::build::get_spirv_key<"concentric_mapping_test">(m_device.get()), "ConcentricMappingTestLog.txt");
-         runSamplerTest.operator()<CPolarMappingTester>("PolarMapping sampler", nbl::this_example::builtin::build::get_spirv_key<"polar_mapping_test">(m_device.get()), "PolarMappingTestLog.txt");
-         runSamplerTest.operator()<CBoxMullerTransformTester>("BoxMullerTransform sampler", nbl::this_example::builtin::build::get_spirv_key<"box_muller_transform_test">(m_device.get()), "BoxMullerTransformTestLog.txt");
-         runSamplerTest.operator()<CSphericalTriangleTester>("SphericalTriangle", nbl::this_example::builtin::build::get_spirv_key<"spherical_triangle">(m_device.get()), "SphericalTriangleTestLog.txt");
-         runSamplerTest.operator()<CProjectedSphericalTriangleTester>("ProjectedSphericalTriangle sampler", nbl::this_example::builtin::build::get_spirv_key<"projected_spherical_triangle_test">(m_device.get()), "ProjectedSphericalTriangleTestLog.txt");
-         runSamplerTest.operator()<CSphericalRectangleTester>("SphericalRectangle sampler", nbl::this_example::builtin::build::get_spirv_key<"spherical_rectangle_test">(m_device.get()), "SphericalRectangleTestLog.txt");
-         runSamplerTest.operator()<CProjectedSphericalRectangleTester>("ProjectedSphericalRectangle sampler", nbl::this_example::builtin::build::get_spirv_key<"projected_spherical_rectangle_test">(m_device.get()), "ProjectedSphericalRectangleTestLog.txt");
+         runSamplerTest.operator()<CLinearTester, "linear_test">("Linear sampler", "LinearTestLog.txt");
+         runSamplerTest.operator()<CBilinearTester, "bilinear_test">("Bilinear sampler", "BilinearTestLog.txt");
+         runSamplerTest.operator()<CUniformHemisphereTester, "uniform_hemisphere_test">("UniformHemisphere sampler", "UniformHemisphereTestLog.txt");
+         runSamplerTest.operator()<CUniformSphereTester, "uniform_sphere_test">("UniformSphere sampler", "UniformSphereTestLog.txt");
+         runSamplerTest.operator()<CProjectedHemisphereTester, "projected_hemisphere_test">("ProjectedHemisphere sampler", "ProjectedHemisphereTestLog.txt");
+         runSamplerTest.operator()<CProjectedSphereTester, "projected_sphere_test">("ProjectedSphere sampler", "ProjectedSphereTestLog.txt");
+         runSamplerTest.operator()<CConcentricMappingTester, "concentric_mapping_test">("ConcentricMapping sampler", "ConcentricMappingTestLog.txt");
+         runSamplerTest.operator()<CPolarMappingTester, "polar_mapping_test">("PolarMapping sampler", "PolarMappingTestLog.txt");
+         runSamplerTest.operator()<CBoxMullerTransformTester, "box_muller_transform_test">("BoxMullerTransform sampler", "BoxMullerTransformTestLog.txt");
+         runSamplerTest.operator()<CSphericalTriangleTester, "spherical_triangle">("SphericalTriangle", "SphericalTriangleTestLog.txt");
+         runSamplerTest.operator()<CProjectedSphericalTriangleTester, "projected_spherical_triangle_test">("ProjectedSphericalTriangle sampler", "ProjectedSphericalTriangleTestLog.txt");
+         runSamplerTest.operator()<CSphericalRectangleTester, "spherical_rectangle_test">("SphericalRectangle sampler", "SphericalRectangleTestLog.txt");
+         runSamplerTest.operator()<CProjectedSphericalRectangleTester, "projected_spherical_rectangle_test">("ProjectedSphericalRectangle sampler", "ProjectedSphericalRectangleTestLog.txt");
       }
 
-      if constexpr (DoBenchmark)
+      if constexpr (true)
       {
          // --- Discrete table construction (CPU) ---
          {
@@ -387,9 +398,9 @@ class HLSLSamplingTests final : public application_templates::MonoDeviceApplicat
          }
 
          // --- GPU table sampler tests ---
-         runSamplerTest.operator()<CPackedAliasAGPUTester>("PackedAliasA GPU sampler", nbl::this_example::builtin::build::get_spirv_key<"packed_alias_a_test">(m_device.get()), "PackedAliasATestLog.txt");
-         runSamplerTest.operator()<CPackedAliasBGPUTester>("PackedAliasB GPU sampler", nbl::this_example::builtin::build::get_spirv_key<"packed_alias_b_test">(m_device.get()), "PackedAliasBTestLog.txt");
-         runSamplerTest.operator()<CCumulativeProbabilityGPUTester>("CumulativeProbability GPU sampler", nbl::this_example::builtin::build::get_spirv_key<"cumulative_probability_test">(m_device.get()), "CumulativeProbabilityTestLog.txt");
+         runSamplerTest.operator()<CPackedAliasAGPUTester, "packed_alias_a_test">("PackedAliasA GPU sampler", "PackedAliasATestLog.txt");
+         runSamplerTest.operator()<CPackedAliasBGPUTester, "packed_alias_b_test">("PackedAliasB GPU sampler", "PackedAliasBTestLog.txt");
+         runSamplerTest.operator()<CCumulativeProbabilityGPUTester, "cumulative_probability_test">("CumulativeProbability GPU sampler", "CumulativeProbabilityTestLog.txt");
       }
       logJacobianSkipCounts(m_logger.get());
       if (pass)
@@ -405,61 +416,33 @@ class HLSLSamplingTests final : public application_templates::MonoDeviceApplicat
          m_logger->log("Running sampler property tests (CPU)...", ILogger::ELL_INFO);
          m_logger->log("WARNING: CPU math may use higher intermediate precision than GPU shaders. Tolerances that pass here may be too tight for GPU.", ILogger::ELL_WARNING);
 
-         CSamplerPropertyTester<LinearPropertyConfig> linearProps(m_logger.get());
-         pass &= linearProps.run();
-
-         CSamplerPropertyTester<BilinearPropertyConfig> bilinearProps(m_logger.get());
-         pass &= bilinearProps.run();
-
-         CSamplerPropertyTester<UniformHemispherePropertyConfig> uniformHemiProps(m_logger.get());
-         pass &= uniformHemiProps.run();
-
-         CSamplerPropertyTester<UniformSpherePropertyConfig> uniformSphereProps(m_logger.get());
-         pass &= uniformSphereProps.run();
-
-         CSamplerPropertyTester<ProjectedHemispherePropertyConfig> projHemiProps(m_logger.get());
-         pass &= projHemiProps.run();
-
-         CSamplerPropertyTester<ProjectedSpherePropertyConfig> projSphereProps(m_logger.get());
-         pass &= projSphereProps.run();
-
-         CSamplerPropertyTester<ConcentricMappingPropertyConfig> concentricProps(m_logger.get());
-         pass &= concentricProps.run();
-
-         CSamplerPropertyTester<PolarMappingPropertyConfig> polarProps(m_logger.get());
-         pass &= polarProps.run();
-
-         CSamplerPropertyTester<BoxMullerTransformPropertyConfig> boxMullerProps(m_logger.get());
-         pass &= boxMullerProps.run();
-
-         CSamplerPropertyTester<SphericalTrianglePropertyConfig> sphTriProps(m_logger.get());
-         pass &= sphTriProps.run();
-
-         CSamplerPropertyTester<ProjectedSphericalTrianglePropertyConfig> projSphTriProps(m_logger.get());
-         pass &= projSphTriProps.run();
-
-         CSamplerPropertyTester<SphericalRectanglePropertyConfig> sphRectProps(m_logger.get());
-         pass &= sphRectProps.run();
+         auto check = [&]<typename Config>()
+         {
+            pass &= CSamplerPropertyTester<Config>(m_logger.get()).run();
+         };
 
-         CSamplerPropertyTester<ProjectedSphericalRectanglePropertyConfig> projSphRectProps(m_logger.get());
-         pass &= projSphRectProps.run();
+         check.operator()<LinearPropertyConfig>();
+         check.operator()<BilinearPropertyConfig>();
+         check.operator()<UniformHemispherePropertyConfig>();
+         check.operator()<UniformSpherePropertyConfig>();
+         check.operator()<ProjectedHemispherePropertyConfig>();
+         check.operator()<ProjectedSpherePropertyConfig>();
+         check.operator()<ConcentricMappingPropertyConfig>();
+         check.operator()<PolarMappingPropertyConfig>();
+         check.operator()<BoxMullerTransformPropertyConfig>();
+         check.operator()<SphericalTrianglePropertyConfig>();
+         check.operator()<ProjectedSphericalTrianglePropertyConfig>();
+         check.operator()<SphericalRectanglePropertyConfig>();
+         check.operator()<ProjectedSphericalRectanglePropertyConfig>();
 
          // Stress tests: extreme coefficient ratios
-         CSamplerPropertyTester<LinearStressConfig> linearStress(m_logger.get());
-         pass &= linearStress.run();
-
-         CSamplerPropertyTester<BilinearStressConfig> bilinearStress(m_logger.get());
-         pass &= bilinearStress.run();
-
-         CSamplerPropertyTester<BilinearPSTPatternConfig> bilinearPST(m_logger.get());
-         pass &= bilinearPST.run();
-
-         CSamplerPropertyTester<SphericalTriangleStressConfig> sphTriStress(m_logger.get());
-         pass &= sphTriStress.run();
+         check.operator()<LinearStressConfig>();
+         check.operator()<BilinearStressConfig>();
+         check.operator()<BilinearPSTPatternConfig>();
+         check.operator()<SphericalTriangleStressConfig>();
 
          // Grazing angle tests
-         CSamplerPropertyTester<ProjectedSphericalTriangleGrazingConfig> grazingProps(m_logger.get());
-         pass &= grazingProps.run();
+         check.operator()<ProjectedSphericalTriangleGrazingConfig>();
 
          if (pass)
             m_logger->log("All sampler property tests PASSED.", ILogger::ELL_INFO);
@@ -475,23 +458,17 @@ class HLSLSamplingTests final : public application_templates::MonoDeviceApplicat
          m_logger->log("Running geometry tests (CPU)...", ILogger::ELL_INFO);
          m_logger->log("WARNING: CPU math may use higher intermediate precision than GPU shaders. Tolerances that pass here may be too tight for GPU.", ILogger::ELL_WARNING);
 
-         CSolidAngleAccuracyTester solidAngleTester(m_logger.get());
-         pass &= solidAngleTester.run();
-
-         CSphericalTriangleGenerateTester sphTriGenTester(m_logger.get());
-         pass &= sphTriGenTester.run();
-
-         CSphericalRectangleGenerateTester sphRectGenTester(m_logger.get());
-         pass &= sphRectGenTester.run();
-
-         CProjectedSphericalRectangleGenerateTester projRectGenTester(m_logger.get());
-         pass &= projRectGenTester.run();
-
-         CProjectedSphericalRectangleGeometricTester projRectGeoTester(m_logger.get());
-         pass &= projRectGeoTester.run();
+         auto check = [&]<typename Tester>()
+         {
+            pass &= Tester(m_logger.get()).run();
+         };
 
-         CProjectedSphericalTriangleGeometricTester pstTester(m_logger.get());
-         pass &= pstTester.run();
+         check.template operator()<CSolidAngleAccuracyTester>();
+         check.template operator()<CSphericalTriangleGenerateTester>();
+         check.template operator()<CSphericalRectangleGenerateTester>();
+         check.template operator()<CProjectedSphericalRectangleGenerateTester>();
+         check.template operator()<CProjectedSphericalRectangleGeometricTester>();
+         check.template operator()<CProjectedSphericalTriangleGeometricTester>();
 
          if (pass)
             m_logger->log("All geometry tests PASSED.", ILogger::ELL_INFO);
diff --git a/64_EmulatedFloatTest/main.cpp b/64_EmulatedFloatTest/main.cpp
index 8329c03b0..352e4e61f 100644
--- a/64_EmulatedFloatTest/main.cpp
+++ b/64_EmulatedFloatTest/main.cpp
@@ -6,6 +6,8 @@
 #include "nbl/examples/examples.hpp"
 
 #include <nabla.h>
+#include <array>
+#include <span>
 #include <iostream>
 #include <cstdio>
 #include <assert.h>
@@ -17,6 +19,8 @@
 
 #include <nbl\builtin\hlsl\math\quadrature\gauss_legendre\gauss_legendre.hlsl>
 
+#include "nbl/examples/Benchmark/IBenchmark.h"
+#include "nbl/examples/Benchmark/GPUBenchmarkHelper.h"
 
 using namespace nbl::core;
 using namespace nbl::hlsl;
@@ -26,1136 +30,1029 @@ using namespace nbl::video;
 using namespace nbl::application_templates;
 using namespace nbl::examples;
 
-constexpr bool DoTests = true;
+constexpr bool DoTests     = true;
 constexpr bool DoBenchmark = true;
 
+// One row per EF64_BENCHMARK_MODE. Each instance owns its own write-sink
+// buffer + descriptor set; the framework's GPUBenchmarkHelper handles
+// cmdbuf / queryPool / pipeline-stats capture / runTimed timing, IBenchmark
+// routes the result through the Aggregator. The shader binds an SSBO at
+// set 0 / binding 0, so we pass an explicit dsLayout to createPipeline.
+class CEF64Benchmark : public GPUBenchmark
+{
+   public:
+   static constexpr const char* kSectionLabel = "EF64 Benchmarks";
+
+   struct SetupData
+   {
+      smart_refctd_ptr<IAssetManager>     assetMgr;
+      core::vector<core::string>          name; // hierarchical row name
+      EF64_BENCHMARK_MODE                 mode; // pushed each run() via PC
+      GPUBenchmarkHelper::ShaderVariant   variant; // precompiled "benchmark" SPIRV
+      uint32_t                            warmupDispatches;
+      uint64_t                            targetBudgetMs;
+   };
+
+   // Shape is fixed by the BENCHMARK_WORKGROUP_* macros; expose it so the
+   // caller uses the same shape both to construct the bench and to build the
+   // RunContext for its span.
+   static WorkloadShape shape()
+   {
+      const hlsl::uint32_t3 wg = {
+         BENCHMARK_WORKGROUP_DIMENSION_SIZE_X,
+         BENCHMARK_WORKGROUP_DIMENSION_SIZE_Y,
+         BENCHMARK_WORKGROUP_DIMENSION_SIZE_Z};
+      const hlsl::uint32_t3 dgc = {BENCHMARK_WORKGROUP_COUNT, 1u, 1u};
+      // Shader writes one float64 per thread per dispatch; "sample" == "thread output".
+      const uint64_t samplesPerDispatch = uint64_t(dgc.x) * dgc.y * dgc.z * wg.x * wg.y * wg.z;
+      return {.workgroupSize = wg, .dispatchGroupCount = dgc, .samplesPerDispatch = samplesPerDispatch};
+   }
+
+   CEF64Benchmark(Aggregator& aggregator, const SetupData& data)
+      : GPUBenchmark(aggregator, GPUBenchmark::SetupData{
+                                    .name             = data.name,
+                                    .warmupDispatches = data.warmupDispatches,
+                                    .shape            = shape(),
+                                    .targetBudgetMs   = data.targetBudgetMs,
+                                 })
+      , m_mode(data.mode)
+   {
+      // Buffer the shader writes to (descriptor-bound; not BDA). Sized for one
+      // float64 per thread; the GPU never reads it back to host.
+      m_buffer = createOutputBuffer(getShape().samplesPerDispatch * sizeof(float64_t));
+
+      // One SSBO at set 0 / binding 0. createSingleBindingDS wires the
+      // layout + pool + DS + write descriptor in one call.
+      auto ds       = createSingleBindingDS(m_buffer);
+      m_dsLayout    = std::move(ds.layout);
+      m_ds          = std::move(ds.set);
+      m_pipelineIdx = createPipeline(data.variant, data.assetMgr, sizeof(BenchmarkPushConstants), joinName(data.name), m_dsLayout);
+   }
+
+   void doRun() override
+   {
+      const PipelineEntry&   pe = m_pipelines[m_pipelineIdx];
+      BenchmarkPushConstants pc = {};
+      pc.benchmarkMode          = m_mode;
+
+      const TimingResult t = runTimedBudgeted(getWarmupDispatches(), getTargetBudgetMs(),
+         [&](IGPUCommandBuffer* cb)
+         {
+            cb->bindDescriptorSets(EPBP_COMPUTE, pe.layout.get(), 0, 1, &m_ds.get());
+            bindAndPush(cb, pe, pc);
+         },
+         [this](IGPUCommandBuffer* cb) { dispatch(cb); },
+         samplesForCurrentRow());
+
+      record(m_name, t, pe.stats);
+   }
+
+   private:
+   EF64_BENCHMARK_MODE                       m_mode = EF64_BENCHMARK_MODE::NATIVE;
+   smart_refctd_ptr<IGPUBuffer>              m_buffer;
+   smart_refctd_ptr<IGPUDescriptorSetLayout> m_dsLayout;
+   smart_refctd_ptr<IGPUDescriptorSet>       m_ds;
+   uint32_t                                  m_pipelineIdx = 0;
+};
+
 class CompatibilityTest final : public MonoDeviceApplication, public BuiltinResourcesApplication
 {
-    using device_base_t = MonoDeviceApplication;
-    using asset_base_t = BuiltinResourcesApplication;
-public:
-    CompatibilityTest(const path& _localInputCWD, const path& _localOutputCWD, const path& _sharedInputCWD, const path& _sharedOutputCWD) :
-        IApplicationFramework(_localInputCWD, _localOutputCWD, _sharedInputCWD, _sharedOutputCWD) {}
-
-    virtual SPhysicalDeviceFeatures getPreferredDeviceFeatures() const override
-    {
-        auto retval = device_base_t::getPreferredDeviceFeatures();
-        retval.pipelineExecutableInfo = true;
-        return retval;
-    }
-
-    bool onAppInitialized(smart_refctd_ptr<ISystem>&& system) override
-    {
-        // since emulated_float64_t rounds to zero
-        std::fesetround(FE_TOWARDZERO);
-
-        if (!device_base_t::onAppInitialized(smart_refctd_ptr(system)))
-            return false;
-        if (!asset_base_t::onAppInitialized(std::move(system)))
-            return false;
-
-        return true;
-    }
-
-    void onAppTerminated_impl() override
-    {
-        m_device->waitIdle();
-    }
-
-    void workLoopBody() override
-    {
-        if constexpr (DoTests)
-        {
-            emulated_float64_tests();
-        }
-        if constexpr (DoBenchmark)
-        {
-            EF64Benchmark benchmark(*this);
-            benchmark.run();
-        }
-
-        m_keepRunning = false;
-    }
-
-    bool keepRunning() override
-    {
-        return m_keepRunning;
-    }
-
-
-private:
-
-    bool m_keepRunning = true;
-
-    constexpr static inline uint32_t EmulatedFloat64TestIterations = 1000u;
-    
-    enum class EmulatedFloatTestDevice
-    {
-        CPU,
-        GPU
-    };
-
-    template<bool FastMath, bool FlushDenormToZero, EmulatedFloatTestDevice Device>
-    bool compareEmulatedFloat64TestValues(const TestValues<FastMath, FlushDenormToZero>& expectedValues, const TestValues<FastMath, FlushDenormToZero>& testValues)
-    {
-        bool success = true;
-
-        auto printOnFailure = [this](EmulatedFloatTestDevice device)
-        {
-            std::string errorMsgPrefix = "";
-            if (device == EmulatedFloatTestDevice::CPU)
-                errorMsgPrefix = "CPU test fail:";
-            else
-                errorMsgPrefix = "GPU test fail:";
-
-            m_logger->log("%s", ILogger::ELL_ERROR, errorMsgPrefix.c_str());
-            m_logFile << errorMsgPrefix << '\n';
-        };
-
-        auto printOnArithmeticFailure = [this](const char* valName, uint64_t expectedValue, uint64_t testValue, uint64_t a, uint64_t b)
-        {
-            double expectedAsDouble = reinterpret_cast<double&>(expectedValue);
-            double testAsDouble = reinterpret_cast<double&>(testValue);
-            double error = std::abs(expectedAsDouble - testAsDouble);
-
-            std::stringstream ss;
-            ss << "for input values: A = " << reinterpret_cast<double&>(a) << " B = " << reinterpret_cast<double&>(b) << '\n';
-            ss << valName << " not equal!";
-            ss << "\nexpected value: " << std::fixed << std::setprecision(20) << expectedAsDouble;
-            ss << "\ntest value:     " << std::fixed << std::setprecision(20) << testAsDouble;
-            ss << "\nerror = " << error << '\n';
-            ss << "bit representations: \n";
-            ss << "seeeeeeeeeeemmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmm\n";
-            ss << std::bitset<64>(expectedValue) << " - expectedValue bit pattern\n";
-            ss << std::bitset<64>(testValue) << " - testValue bit pattern \n";
-
-            m_logger->log("%s", ILogger::ELL_ERROR, ss.str().c_str());
-            m_logFile << ss.str() << '\n';
-
-            //std::cout << "ULP error: " << std::max(expectedValue, testValue) - std::min(expectedValue, testValue) << "\n\n";
-
-        };
-
-        auto calcULPError = [](emulated_float64_t::storage_t expectedValue, emulated_float64_t::storage_t testValue)
-        {
-            return std::max(expectedValue, testValue) - std::min(expectedValue, testValue);
-        };
-
-        auto printOnComparisonFailure = [this](const char* valName, int expectedValue, int testValue, double a, double b)
-        {
-            std::string inputValuesStr = std::string("for input values: A = ") + std::to_string(a) + std::string(" B = ") + std::to_string(b);
-
-            m_logger->log("%s", ILogger::ELL_ERROR, inputValuesStr.c_str());
-            m_logFile << inputValuesStr << '\n';
-
-            std::stringstream ss;
-            ss << valName << " not equal!";
-            ss << "\nexpected value: " << std::boolalpha << bool(expectedValue);
-            ss << "\ntest value: " << std::boolalpha << bool(testValue);
-
-            m_logger->log("%s", ILogger::ELL_ERROR, ss.str().c_str());
-            m_logFile << ss.str() << '\n';
-        };
-
-        if (calcULPError(expectedValues.int32CreateVal, testValues.int32CreateVal) > 1u)
-        {
-            printOnFailure(Device);
-            printOnArithmeticFailure("int32CreateVal", expectedValues.int32CreateVal, testValues.int32CreateVal, expectedValues.a, expectedValues.b);
-            success = false;
-        }
-        if (calcULPError(expectedValues.int64CreateVal, testValues.int64CreateVal) > 1u)
-        {
-            printOnFailure(Device);
-            printOnArithmeticFailure("int64CreateVal", expectedValues.int64CreateVal, testValues.int64CreateVal, expectedValues.a, expectedValues.b);
-            success = false;
-        }
-        if (calcULPError(expectedValues.uint32CreateVal, testValues.uint32CreateVal) > 1u)
-        {
-            printOnFailure(Device);
-            printOnArithmeticFailure("uint32CreateVal", expectedValues.uint32CreateVal, testValues.uint32CreateVal, expectedValues.a, expectedValues.b);
-            success = false;
-        }
-        if (calcULPError(expectedValues.uint64CreateVal, testValues.uint64CreateVal) > 1u)
-        {
-            printOnFailure(Device);
-            printOnArithmeticFailure("uint64CreateVal", expectedValues.uint64CreateVal, testValues.uint64CreateVal, expectedValues.a, expectedValues.b);
-            success = false;
-        }
-        if (calcULPError(expectedValues.float32CreateVal, testValues.float32CreateVal) > 1u)
-        {
-            printOnFailure(Device);
-            printOnArithmeticFailure("float32CreateVal", expectedValues.float32CreateVal, testValues.float32CreateVal, expectedValues.a, expectedValues.b);
-            success = false;
-        }
-        if (expectedValues.float64CreateVal != testValues.float64CreateVal)
-        {
-            printOnFailure(Device);
-            printOnArithmeticFailure("float64CreateVal", expectedValues.float64CreateVal, testValues.float64CreateVal, expectedValues.a, expectedValues.b);
-            success = false;
-        }
-        if (calcULPError(expectedValues.additionVal, testValues.additionVal) > 1u)
-        {
-            printOnFailure(Device);
-            printOnArithmeticFailure("additionVal", expectedValues.additionVal, testValues.additionVal, expectedValues.a, expectedValues.b);
-            success = false;
-        }
-        if (calcULPError(expectedValues.substractionVal, testValues.substractionVal) > 1u)
-        {
-            printOnFailure(Device);
-            printOnArithmeticFailure("substractionVal", expectedValues.substractionVal, testValues.substractionVal, expectedValues.a, expectedValues.b);
-            success = false;
-        }
-        if (calcULPError(expectedValues.multiplicationVal, testValues.multiplicationVal) > 1u)
-        {
-            printOnFailure(Device);
-            printOnArithmeticFailure("multiplicationVal", expectedValues.multiplicationVal, testValues.multiplicationVal, expectedValues.a, expectedValues.b);
-            success = false;
-        }
-        if (calcULPError(expectedValues.divisionVal, testValues.divisionVal) > 1u)
-        {
-            printOnFailure(Device);
-            printOnArithmeticFailure("divisionVal", expectedValues.divisionVal, testValues.divisionVal, expectedValues.a, expectedValues.b);
-            success = false;
-        }
-        if (expectedValues.lessOrEqualVal != testValues.lessOrEqualVal)
-        {
-            printOnFailure(Device);
-            printOnComparisonFailure("lessOrEqualVal", expectedValues.lessOrEqualVal, testValues.lessOrEqualVal, expectedValues.a, expectedValues.b);
-            success = false;
-        }
-        if (expectedValues.greaterOrEqualVal != testValues.greaterOrEqualVal)
-        {
-            printOnFailure(Device);
-            printOnComparisonFailure("greaterOrEqualVal", expectedValues.greaterOrEqualVal, testValues.greaterOrEqualVal, expectedValues.a, expectedValues.b);
-            success = false;
-        }
-        if (expectedValues.equalVal != testValues.equalVal)
-        {
-            printOnFailure(Device);
-            printOnComparisonFailure("equalVal", expectedValues.equalVal, testValues.equalVal, expectedValues.a, expectedValues.b);
-            success = false;
-        }
-        if (expectedValues.notEqualVal != testValues.notEqualVal)
-        {
-            printOnFailure(Device);
-            printOnComparisonFailure("notEqualVal", expectedValues.notEqualVal, testValues.notEqualVal, expectedValues.a, expectedValues.b);
-            success = false;
-        }
-        if (expectedValues.lessVal != testValues.lessVal)
-        {
-            printOnFailure(Device);
-            printOnComparisonFailure("lessVal", expectedValues.lessVal, testValues.lessVal, expectedValues.a, expectedValues.b);
-            success = false;
-        }
-        if (expectedValues.greaterVal != testValues.greaterVal)
-        {
-            printOnFailure(Device);
-            printOnComparisonFailure("greaterVal", expectedValues.greaterVal, testValues.greaterVal, expectedValues.a, expectedValues.b);
-            success = false;
-        }
-
-        return success;
-    };
-
-    class EF64Submitter
-    {
-    public:
-        EF64Submitter(CompatibilityTest& base)
-            :m_base(base), m_pushConstants({}), m_semaphoreCounter(0)
-        {
-            // setting up pipeline in the constructor
-            m_queueFamily = base.getComputeQueue()->getFamilyIndex();
-            m_semaphore = base.m_device->createSemaphore(0);
-            m_cmdpool = base.m_device->createCommandPool(m_queueFamily, IGPUCommandPool::CREATE_FLAGS::RESET_COMMAND_BUFFER_BIT);
-            if (!m_cmdpool->createCommandBuffers(IGPUCommandPool::BUFFER_LEVEL::PRIMARY, 1u, &m_cmdbuf))
-                base.logFail("Failed to create Command Buffers!\n");
-
-            // Load shaders, set up pipeline
+   using device_base_t = MonoDeviceApplication;
+   using asset_base_t  = BuiltinResourcesApplication;
+
+   public:
+   CompatibilityTest(const path& _localInputCWD, const path& _localOutputCWD, const path& _sharedInputCWD, const path& _sharedOutputCWD) : IApplicationFramework(_localInputCWD, _localOutputCWD, _sharedInputCWD, _sharedOutputCWD) {}
+
+   virtual SPhysicalDeviceFeatures getPreferredDeviceFeatures() const override
+   {
+      auto retval                   = device_base_t::getPreferredDeviceFeatures();
+      retval.pipelineExecutableInfo = true;
+      return retval;
+   }
+
+   bool onAppInitialized(smart_refctd_ptr<ISystem>&& system) override
+   {
+      // since emulated_float64_t rounds to zero
+      std::fesetround(FE_TOWARDZERO);
+
+      if (!device_base_t::onAppInitialized(smart_refctd_ptr(system)))
+         return false;
+      if (!asset_base_t::onAppInitialized(std::move(system)))
+         return false;
+
+      return true;
+   }
+
+   void onAppTerminated_impl() override
+   {
+      m_device->waitIdle();
+   }
+
+   void workLoopBody() override
+   {
+      if constexpr (DoTests)
+      {
+         emulated_float64_tests();
+      }
+      if constexpr (DoBenchmark)
+      {
+         runEF64Benchmarks();
+      }
+
+      m_keepRunning = false;
+   }
+
+   bool keepRunning() override
+   {
+      return m_keepRunning;
+   }
+
+
+   private:
+   bool m_keepRunning = true;
+
+   constexpr static inline uint32_t EmulatedFloat64TestIterations = 1000u;
+
+   enum class EmulatedFloatTestDevice
+   {
+      CPU,
+      GPU
+   };
+
+   template<bool FastMath, bool FlushDenormToZero, EmulatedFloatTestDevice Device>
+   bool compareEmulatedFloat64TestValues(const TestValues<FastMath, FlushDenormToZero>& expectedValues, const TestValues<FastMath, FlushDenormToZero>& testValues)
+   {
+      bool success = true;
+
+      auto printOnFailure = [this](EmulatedFloatTestDevice device)
+      {
+         std::string errorMsgPrefix = "";
+         if (device == EmulatedFloatTestDevice::CPU)
+            errorMsgPrefix = "CPU test fail:";
+         else
+            errorMsgPrefix = "GPU test fail:";
+
+         m_logger->log("%s", ILogger::ELL_ERROR, errorMsgPrefix.c_str());
+         m_logFile << errorMsgPrefix << '\n';
+      };
+
+      auto printOnArithmeticFailure = [this](const char* valName, uint64_t expectedValue, uint64_t testValue, uint64_t a, uint64_t b)
+      {
+         double expectedAsDouble = reinterpret_cast<double&>(expectedValue);
+         double testAsDouble     = reinterpret_cast<double&>(testValue);
+         double error            = std::abs(expectedAsDouble - testAsDouble);
+
+         std::stringstream ss;
+         ss << "for input values: A = " << reinterpret_cast<double&>(a) << " B = " << reinterpret_cast<double&>(b) << '\n';
+         ss << valName << " not equal!";
+         ss << "\nexpected value: " << std::fixed << std::setprecision(20) << expectedAsDouble;
+         ss << "\ntest value:     " << std::fixed << std::setprecision(20) << testAsDouble;
+         ss << "\nerror = " << error << '\n';
+         ss << "bit representations: \n";
+         ss << "seeeeeeeeeeemmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmm\n";
+         ss << std::bitset<64>(expectedValue) << " - expectedValue bit pattern\n";
+         ss << std::bitset<64>(testValue) << " - testValue bit pattern \n";
+
+         m_logger->log("%s", ILogger::ELL_ERROR, ss.str().c_str());
+         m_logFile << ss.str() << '\n';
+
+         //std::cout << "ULP error: " << std::max(expectedValue, testValue) - std::min(expectedValue, testValue) << "\n\n";
+      };
+
+      auto calcULPError = [](emulated_float64_t::storage_t expectedValue, emulated_float64_t::storage_t testValue)
+      {
+         return std::max(expectedValue, testValue) - std::min(expectedValue, testValue);
+      };
+
+      auto printOnComparisonFailure = [this](const char* valName, int expectedValue, int testValue, double a, double b)
+      {
+         std::string inputValuesStr = std::string("for input values: A = ") + std::to_string(a) + std::string(" B = ") + std::to_string(b);
+
+         m_logger->log("%s", ILogger::ELL_ERROR, inputValuesStr.c_str());
+         m_logFile << inputValuesStr << '\n';
+
+         std::stringstream ss;
+         ss << valName << " not equal!";
+         ss << "\nexpected value: " << std::boolalpha << bool(expectedValue);
+         ss << "\ntest value: " << std::boolalpha << bool(testValue);
+
+         m_logger->log("%s", ILogger::ELL_ERROR, ss.str().c_str());
+         m_logFile << ss.str() << '\n';
+      };
+
+      if (calcULPError(expectedValues.int32CreateVal, testValues.int32CreateVal) > 1u)
+      {
+         printOnFailure(Device);
+         printOnArithmeticFailure("int32CreateVal", expectedValues.int32CreateVal, testValues.int32CreateVal, expectedValues.a, expectedValues.b);
+         success = false;
+      }
+      if (calcULPError(expectedValues.int64CreateVal, testValues.int64CreateVal) > 1u)
+      {
+         printOnFailure(Device);
+         printOnArithmeticFailure("int64CreateVal", expectedValues.int64CreateVal, testValues.int64CreateVal, expectedValues.a, expectedValues.b);
+         success = false;
+      }
+      if (calcULPError(expectedValues.uint32CreateVal, testValues.uint32CreateVal) > 1u)
+      {
+         printOnFailure(Device);
+         printOnArithmeticFailure("uint32CreateVal", expectedValues.uint32CreateVal, testValues.uint32CreateVal, expectedValues.a, expectedValues.b);
+         success = false;
+      }
+      if (calcULPError(expectedValues.uint64CreateVal, testValues.uint64CreateVal) > 1u)
+      {
+         printOnFailure(Device);
+         printOnArithmeticFailure("uint64CreateVal", expectedValues.uint64CreateVal, testValues.uint64CreateVal, expectedValues.a, expectedValues.b);
+         success = false;
+      }
+      if (calcULPError(expectedValues.float32CreateVal, testValues.float32CreateVal) > 1u)
+      {
+         printOnFailure(Device);
+         printOnArithmeticFailure("float32CreateVal", expectedValues.float32CreateVal, testValues.float32CreateVal, expectedValues.a, expectedValues.b);
+         success = false;
+      }
+      if (expectedValues.float64CreateVal != testValues.float64CreateVal)
+      {
+         printOnFailure(Device);
+         printOnArithmeticFailure("float64CreateVal", expectedValues.float64CreateVal, testValues.float64CreateVal, expectedValues.a, expectedValues.b);
+         success = false;
+      }
+      if (calcULPError(expectedValues.additionVal, testValues.additionVal) > 1u)
+      {
+         printOnFailure(Device);
+         printOnArithmeticFailure("additionVal", expectedValues.additionVal, testValues.additionVal, expectedValues.a, expectedValues.b);
+         success = false;
+      }
+      if (calcULPError(expectedValues.substractionVal, testValues.substractionVal) > 1u)
+      {
+         printOnFailure(Device);
+         printOnArithmeticFailure("substractionVal", expectedValues.substractionVal, testValues.substractionVal, expectedValues.a, expectedValues.b);
+         success = false;
+      }
+      if (calcULPError(expectedValues.multiplicationVal, testValues.multiplicationVal) > 1u)
+      {
+         printOnFailure(Device);
+         printOnArithmeticFailure("multiplicationVal", expectedValues.multiplicationVal, testValues.multiplicationVal, expectedValues.a, expectedValues.b);
+         success = false;
+      }
+      if (calcULPError(expectedValues.divisionVal, testValues.divisionVal) > 1u)
+      {
+         printOnFailure(Device);
+         printOnArithmeticFailure("divisionVal", expectedValues.divisionVal, testValues.divisionVal, expectedValues.a, expectedValues.b);
+         success = false;
+      }
+      if (expectedValues.lessOrEqualVal != testValues.lessOrEqualVal)
+      {
+         printOnFailure(Device);
+         printOnComparisonFailure("lessOrEqualVal", expectedValues.lessOrEqualVal, testValues.lessOrEqualVal, expectedValues.a, expectedValues.b);
+         success = false;
+      }
+      if (expectedValues.greaterOrEqualVal != testValues.greaterOrEqualVal)
+      {
+         printOnFailure(Device);
+         printOnComparisonFailure("greaterOrEqualVal", expectedValues.greaterOrEqualVal, testValues.greaterOrEqualVal, expectedValues.a, expectedValues.b);
+         success = false;
+      }
+      if (expectedValues.equalVal != testValues.equalVal)
+      {
+         printOnFailure(Device);
+         printOnComparisonFailure("equalVal", expectedValues.equalVal, testValues.equalVal, expectedValues.a, expectedValues.b);
+         success = false;
+      }
+      if (expectedValues.notEqualVal != testValues.notEqualVal)
+      {
+         printOnFailure(Device);
+         printOnComparisonFailure("notEqualVal", expectedValues.notEqualVal, testValues.notEqualVal, expectedValues.a, expectedValues.b);
+         success = false;
+      }
+      if (expectedValues.lessVal != testValues.lessVal)
+      {
+         printOnFailure(Device);
+         printOnComparisonFailure("lessVal", expectedValues.lessVal, testValues.lessVal, expectedValues.a, expectedValues.b);
+         success = false;
+      }
+      if (expectedValues.greaterVal != testValues.greaterVal)
+      {
+         printOnFailure(Device);
+         printOnComparisonFailure("greaterVal", expectedValues.greaterVal, testValues.greaterVal, expectedValues.a, expectedValues.b);
+         success = false;
+      }
+
+      return success;
+   };
+
+   class EF64Submitter
+   {
+  public:
+      EF64Submitter(CompatibilityTest& base)
+         : m_base(base), m_pushConstants({}), m_semaphoreCounter(0)
+      {
+         // setting up pipeline in the constructor
+         m_queueFamily = base.getComputeQueue()->getFamilyIndex();
+         m_semaphore   = base.m_device->createSemaphore(0);
+         m_cmdpool     = base.m_device->createCommandPool(m_queueFamily, IGPUCommandPool::CREATE_FLAGS::RESET_COMMAND_BUFFER_BIT);
+         if (!m_cmdpool->createCommandBuffers(IGPUCommandPool::BUFFER_LEVEL::PRIMARY, 1u, &m_cmdbuf))
+            base.logFail("Failed to create Command Buffers!\n");
+
+         // Load shaders, set up pipeline
+         {
+            smart_refctd_ptr<IShader> shader;
             {
-                smart_refctd_ptr<IShader> shader;
-                {
-                    IAssetLoader::SAssetLoadParams lp = {};
-                    lp.logger = base.m_logger.get();
-                    lp.workingDirectory = "app_resources"; // virtual root
-
-                    auto key = nbl::this_example::builtin::build::get_spirv_key<"test">(base.m_device.get());
-                    auto assetBundle = base.m_assetMgr->getAsset(key.data(), lp);
-                    const auto assets = assetBundle.getContents();
-                    if (assets.empty())
-                    {
-                        base.logFail("Could not load shader!");
-                        assert(0);
-                    }
-
-                    // It would be super weird if loading a shader from a file produced more than 1 asset
-                    assert(assets.size() == 1);
-                    shader = IAsset::castDown<IShader>(assets[0]);
-                }
-
-                if (!shader)
-                    base.logFail("Failed to load precompiled \"test\" shader!\n");
-
-                nbl::video::IGPUDescriptorSetLayout::SBinding bindings[1] = {
-                    {
-                        .binding = 0,
-                        .type = nbl::asset::IDescriptor::E_TYPE::ET_STORAGE_BUFFER,
-                        .createFlags = IGPUDescriptorSetLayout::SBinding::E_CREATE_FLAGS::ECF_NONE,
-                        .stageFlags = ShaderStage::ESS_COMPUTE,
-                        .count = 1
-                    }
-                };
-                smart_refctd_ptr<IGPUDescriptorSetLayout> dsLayout = base.m_device->createDescriptorSetLayout(bindings);
-                if (!dsLayout)
-                    base.logFail("Failed to create a Descriptor Layout!\n");
-
-                SPushConstantRange pushConstantRanges[] = {
-                {
-                    .stageFlags = ShaderStage::ESS_COMPUTE,
-                    .offset = 0,
-                    .size = sizeof(PushConstants)
-                }
-                };
-                m_pplnLayout = base.m_device->createPipelineLayout(pushConstantRanges, smart_refctd_ptr(dsLayout));
-                if (!m_pplnLayout)
-                    base.logFail("Failed to create a Pipeline Layout!\n");
-
-                {
-                    IGPUComputePipeline::SCreationParams params = {};
-                    params.layout = m_pplnLayout.get();
-                    params.shader.entryPoint = "main";
-                    params.shader.shader = shader.get();
-                    if (base.m_device->getEnabledFeatures().pipelineExecutableInfo)
-                    {
-                        params.flags |= IGPUComputePipeline::SCreationParams::FLAGS::CAPTURE_STATISTICS;
-                        params.flags |= IGPUComputePipeline::SCreationParams::FLAGS::CAPTURE_INTERNAL_REPRESENTATIONS;
-                    }
-                    if (!base.m_device->createComputePipelines(nullptr, { &params,1 }, &m_pipeline))
-                        base.logFail("Failed to create pipelines (compile & link shaders)!\n");
-
-                    if (base.m_device->getEnabledFeatures().pipelineExecutableInfo)
-                    {
-                        auto report = system::to_string(m_pipeline->getExecutableInfo());
-                        base.m_logger->log("EF64Submitter Pipeline Executable Report:\n%s", ILogger::ELL_PERFORMANCE, report.c_str());
-                    }
-                }
-
-                // Allocate the memory
-                {
-                    constexpr size_t BufferSize = sizeof(TestValues<false, true>);
-
-                    nbl::video::IGPUBuffer::SCreationParams params = {};
-                    params.size = BufferSize;
-                    params.usage = IGPUBuffer::EUF_STORAGE_BUFFER_BIT;
-                    smart_refctd_ptr<IGPUBuffer> outputBuff = base.m_device->createBuffer(std::move(params));
-                    if (!outputBuff)
-                        base.logFail("Failed to create a GPU Buffer of size %d!\n", params.size);
-
-                    outputBuff->setObjectDebugName("emulated_float64_t output buffer");
-
-                    nbl::video::IDeviceMemoryBacked::SDeviceMemoryRequirements reqs = outputBuff->getMemoryReqs();
-                    reqs.memoryTypeBits &= base.m_physicalDevice->getHostVisibleMemoryTypeBits();
-
-                    m_allocation = base.m_device->allocate(reqs, outputBuff.get(), nbl::video::IDeviceMemoryAllocation::EMAF_NONE);
-                    if (!m_allocation.isValid())
-                        base.logFail("Failed to allocate Device Memory compatible with our GPU Buffer!\n");
-
-                    assert(outputBuff->getBoundMemory().memory == m_allocation.memory.get());
-                    smart_refctd_ptr<nbl::video::IDescriptorPool> pool = base.m_device->createDescriptorPoolForDSLayouts(IDescriptorPool::ECF_NONE, { &dsLayout.get(),1 });
-
-                    m_ds = pool->createDescriptorSet(std::move(dsLayout));
-                    {
-                        IGPUDescriptorSet::SDescriptorInfo info[1];
-                        info[0].desc = smart_refctd_ptr(outputBuff);
-                        info[0].info.buffer = { .offset = 0,.size = BufferSize };
-                        IGPUDescriptorSet::SWriteDescriptorSet writes[1] = {
-                            {.dstSet = m_ds.get(),.binding = 0,.arrayElement = 0,.count = 1,.info = info}
-                        };
-                        base.m_device->updateDescriptorSets(writes, {});
-                    }
-                }
-
-                if (!m_allocation.memory->map({ 0ull,m_allocation.memory->getAllocationSize() }, IDeviceMemoryAllocation::EMCAF_READ))
-                    base.logFail("Failed to map the Device Memory!\n");
+               IAssetLoader::SAssetLoadParams lp = {};
+               lp.logger                         = base.m_logger.get();
+               lp.workingDirectory               = "app_resources"; // virtual root
+
+               auto       key         = nbl::this_example::builtin::build::get_spirv_key<"test">(base.m_device.get());
+               auto       assetBundle = base.m_assetMgr->getAsset(key.data(), lp);
+               const auto assets      = assetBundle.getContents();
+               if (assets.empty())
+               {
+                  base.logFail("Could not load shader!");
+                  assert(0);
+               }
+
+               // It would be super weird if loading a shader from a file produced more than 1 asset
+               assert(assets.size() == 1);
+               shader = IAsset::castDown<IShader>(assets[0]);
             }
 
-            // if the mapping is not coherent the range needs to be invalidated to pull in new data for the CPU's caches
-            const ILogicalDevice::MappedMemoryRange memoryRange(m_allocation.memory.get(), 0ull, m_allocation.memory->getAllocationSize());
-            if (!m_allocation.memory->getMemoryPropertyFlags().hasFlags(IDeviceMemoryAllocation::EMPF_HOST_COHERENT_BIT))
-                base.m_device->invalidateMappedMemoryRanges(1, &memoryRange);
-
-            assert(memoryRange.valid() && memoryRange.length >= sizeof(TestValues<false, true>));
-
-            m_queue = m_base.m_device->getQueue(m_queueFamily, 0);
-        }
-
-        ~EF64Submitter() 
-        {
-            m_allocation.memory->unmap();
-        }
-
-        void setPushConstants(PushConstants& pc)
-        {
-            m_pushConstants = pc;
-        }
-
-        TestValues<false, true> submitGetGPUTestValues()
-        {
-            // record command buffer
-            m_cmdbuf->reset(IGPUCommandBuffer::RESET_FLAGS::NONE);
-            m_cmdbuf->begin(IGPUCommandBuffer::USAGE::NONE);
-            m_cmdbuf->beginDebugMarker("emulated_float64_t compute dispatch", vectorSIMDf(0, 1, 0, 1));
-            m_cmdbuf->bindComputePipeline(m_pipeline.get());
-            m_cmdbuf->bindDescriptorSets(nbl::asset::EPBP_COMPUTE, m_pplnLayout.get(), 0, 1, &m_ds.get());
-            m_cmdbuf->pushConstants(m_pplnLayout.get(), IShader::E_SHADER_STAGE::ESS_COMPUTE, 0, sizeof(PushConstants), &m_pushConstants);
-            m_cmdbuf->dispatch(WORKGROUP_SIZE, 1, 1);
-            m_cmdbuf->endDebugMarker();
-            m_cmdbuf->end();
-
-            IQueue::SSubmitInfo submitInfos[1] = {};
-            const IQueue::SSubmitInfo::SCommandBufferInfo cmdbufs[] = { {.cmdbuf = m_cmdbuf.get()}};
-            submitInfos[0].commandBuffers = cmdbufs;
-            const IQueue::SSubmitInfo::SSemaphoreInfo signals[] = { {.semaphore = m_semaphore.get(), .value = ++m_semaphoreCounter, .stageMask = asset::PIPELINE_STAGE_FLAGS::COMPUTE_SHADER_BIT}};
-            submitInfos[0].signalSemaphores = signals;
-            
-            m_base.m_api->startCapture();
-            m_queue->submit(submitInfos);
-            m_base.m_api->endCapture();
-
-            m_base.m_device->waitIdle();
-            TestValues<false, true> output;
-            std::memcpy(&output, static_cast<TestValues<false, true>*>(m_allocation.memory->getMappedPointer()), sizeof(TestValues<false, true>));
-            m_base.m_device->waitIdle();
-
-            return output;
-        }
-
-    private:
-        uint32_t m_queueFamily;
-        nbl::video::IDeviceMemoryAllocator::SAllocation m_allocation = {};
-        smart_refctd_ptr<nbl::video::IGPUCommandBuffer> m_cmdbuf = nullptr;
-        smart_refctd_ptr<nbl::video::IGPUCommandPool> m_cmdpool = nullptr;
-        smart_refctd_ptr<nbl::video::IGPUDescriptorSet> m_ds = nullptr;
-        smart_refctd_ptr<nbl::video::IGPUPipelineLayout> m_pplnLayout = nullptr;
-        PushConstants m_pushConstants;
-        CompatibilityTest& m_base;
-        smart_refctd_ptr<nbl::video::IGPUComputePipeline> m_pipeline;
-        smart_refctd_ptr<ISemaphore> m_semaphore;
-        IQueue* m_queue;
-        uint64_t m_semaphoreCounter;
-    };
-
-    void emulated_float64_tests()
-    {
-        EF64Submitter submitter(*this);
-
-        auto printTestOutput = [this](const std::string& functionName, const EmulatedFloat64TestOutput& testResult)
-            {
-                std::cout << functionName << ": " << std::endl;
-
-                if (!testResult.cpuTestsSucceed)
-                    logFail("Incorrect CPU determinated values!");
-                else
-                    m_logger->log("Correct CPU determinated values!", ILogger::ELL_PERFORMANCE);
-
-                if (!testResult.gpuTestsSucceed)
-                    logFail("Incorrect GPU determinated values!");
-                else
-                    m_logger->log("Correct GPU determinated values!", ILogger::ELL_PERFORMANCE);
-            };
-
-        m_logFile.open("EmulatedFloatTestLog.txt", std::ios::out | std::ios::trunc);
-        if (!m_logFile.is_open())
-            m_logger->log("Failed to open log file!", system::ILogger::ELL_ERROR);
-
-        printTestOutput("emulatedFloat64RandomValuesTest", emulatedFloat64RandomValuesTest(submitter));
-        printTestOutput("emulatedFloat64RandomValuesTestContrastingExponents", emulatedFloat64RandomValuesTestContrastingExponents(submitter));
-        printTestOutput("emulatedFloat64NegAndPosZeroTest", emulatedFloat64NegAndPosZeroTest(submitter));
-        printTestOutput("emulatedFloat64BothValuesInfTest", emulatedFloat64BothValuesInfTest(submitter));
-        printTestOutput("emulatedFloat64BothValuesNegInfTest", emulatedFloat64BothValuesNegInfTest(submitter));
-        printTestOutput("emulatedFloat64OneValIsInfOtherIsNegInfTest", emulatedFloat64OneValIsInfOtherIsNegInfTest(submitter));
-        printTestOutput("emulatedFloat64OneValIsInfTest", emulatedFloat64OneValIsInfTest(submitter));
-        printTestOutput("emulatedFloat64OneValIsNegInfTest", emulatedFloat64OneValIsNegInfTest(submitter));
-        if(false) // doesn't work for some reason + fast math is enabled by default
-            printTestOutput("emulatedFloat64BNaNTest", emulatedFloat64BNaNTest(submitter));
-        printTestOutput("emulatedFloat64BInfTest", emulatedFloat64OneValIsZeroTest(submitter));
-        printTestOutput("emulatedFloat64BNegInfTest", emulatedFloat64OneValIsNegZeroTest(submitter));
-
-        m_logFile.close();
-    }
-
-    template <bool FastMath, bool FlushDenormToZero>
-    struct EmulatedFloat64TestValuesInfo
-    {
-        emulated_float64_t<FastMath, FlushDenormToZero> a;
-        emulated_float64_t<FastMath, FlushDenormToZero> b;
-        ConstructorTestValues constrTestValues;
-        TestValues<FastMath, FlushDenormToZero> expectedTestValues;
-        
-        void fillExpectedTestValues()
-        {
-            double aAsDouble = reinterpret_cast<double&>(a);
-            double bAsDouble = reinterpret_cast<double&>(b);
-
-            expectedTestValues.a = a.data;
-            expectedTestValues.b = b.data;
-
-            expectedTestValues.int32CreateVal = bit_cast<uint64_t>(double(constrTestValues.int32));
-            expectedTestValues.int64CreateVal = bit_cast<uint64_t>(double(constrTestValues.int64));
-            expectedTestValues.uint32CreateVal = bit_cast<uint64_t>(double(constrTestValues.uint32));
-            expectedTestValues.uint64CreateVal = bit_cast<uint64_t>(double(constrTestValues.uint64));
-            expectedTestValues.float32CreateVal = bit_cast<uint64_t>(double(constrTestValues.float32));
-            expectedTestValues.float64CreateVal = bit_cast<uint64_t>(constrTestValues.float64);
-            expectedTestValues.additionVal = emulated_float64_t<FastMath, FlushDenormToZero>::create(aAsDouble + bAsDouble).data;
-            expectedTestValues.substractionVal = emulated_float64_t<FastMath, FlushDenormToZero>::create(aAsDouble - bAsDouble).data;
-            expectedTestValues.multiplicationVal = emulated_float64_t<FastMath, FlushDenormToZero>::create(aAsDouble * bAsDouble).data;
-            expectedTestValues.divisionVal = emulated_float64_t<FastMath, FlushDenormToZero>::create(aAsDouble / bAsDouble).data;
-            expectedTestValues.lessOrEqualVal = aAsDouble <= bAsDouble;
-            expectedTestValues.greaterOrEqualVal = aAsDouble >= bAsDouble;
-            expectedTestValues.equalVal = aAsDouble == bAsDouble;
-            expectedTestValues.notEqualVal = aAsDouble != bAsDouble;
-            expectedTestValues.lessVal = aAsDouble < bAsDouble;
-            expectedTestValues.greaterVal = aAsDouble > bAsDouble;
-        }
-    };
-
-    struct EmulatedFloat64TestOutput
-    {
-        bool cpuTestsSucceed;
-        bool gpuTestsSucceed;
-    };
-
-    EmulatedFloat64TestOutput emulatedFloat64LoopedTests_impl(EF64Submitter& submitter, 
-        const uint32_t iterations,
-        const std::function<double()>& determineValueA, 
-        const std::function<double()>& determineValueB)
-    {
-        EmulatedFloat64TestOutput output = { true, true };
-
-        std::uniform_int_distribution i32Distribution(-std::numeric_limits<int>::max(), std::numeric_limits<int>::max());
-        std::uniform_int_distribution i64Distribution(-std::numeric_limits<int64_t>::max(), std::numeric_limits<int64_t>::max());
-        std::uniform_int_distribution u32Distribution(-std::numeric_limits<uint32_t>::max(), std::numeric_limits<uint32_t>::max());
-        std::uniform_int_distribution u64Distribution(-std::numeric_limits<uint64_t>::max(), std::numeric_limits<uint64_t>::max());
-        std::uniform_real_distribution fDistribution(-100000.0, 100000.0);
-        
-        std::random_device rd;
-        std::mt19937 mt(rd());
-
-        for (uint32_t i = 0u; i < iterations; ++i)
-        {
-            // generate random test values
-            EmulatedFloat64TestValuesInfo<false, true> testValInfo;
-            double aTmp = determineValueA();
-            double bTmp = determineValueB();
-            testValInfo.a.data = reinterpret_cast<emulated_float64_t<false, true>::storage_t&>(aTmp);
-            testValInfo.b.data = reinterpret_cast<emulated_float64_t<false, true>::storage_t&>(bTmp);
-            testValInfo.constrTestValues.int32 = i32Distribution(mt);
-            testValInfo.constrTestValues.int64 = i64Distribution(mt);
-            testValInfo.constrTestValues.uint32 = u32Distribution(mt);
-            testValInfo.constrTestValues.uint64 = u64Distribution(mt);
-            testValInfo.constrTestValues.float32 = fDistribution(mt);
-            testValInfo.constrTestValues.float64 = fDistribution(mt);
-
-            testValInfo.fillExpectedTestValues();
-            auto singleTestOutput = performEmulatedFloat64Tests(testValInfo, submitter);
-
-            if (!singleTestOutput.cpuTestsSucceed)
-                output.cpuTestsSucceed = false;
-            if (!singleTestOutput.gpuTestsSucceed)
-                output.gpuTestsSucceed = false;
-        }
-
-        return output;
-    }
-
-    EmulatedFloat64TestOutput emulatedFloat64RandomValuesTest(EF64Submitter& submitter)
-    {
-        auto getRandomFloat64 = []()
-            {
-                static std::random_device rd;
-                static std::mt19937 mt(rd());
-                static std::uniform_real_distribution distribution(-100000.0, 100000.0);
-
-
-                return distribution(mt);
-            };
-
-        return emulatedFloat64LoopedTests_impl(submitter, EmulatedFloat64TestIterations, getRandomFloat64, getRandomFloat64);
-    }
-
-    EmulatedFloat64TestOutput emulatedFloat64RandomValuesTestContrastingExponents(EF64Submitter& submitter)
-    {
-        auto getRandomSmallFloat64 = []()
-            {
-                static std::random_device rd;
-                static std::mt19937 mt(rd());
-                static std::uniform_real_distribution distribution(-0.01, 0.01);
-
-                return distribution(mt);
-            };
-
-        auto getRandomLargeFloat64 = []()
-            {
-                static std::random_device rd;
-                static std::mt19937 mt(rd());
-                static std::uniform_real_distribution distribution(1000000000.0, 2000000000.0);
-                static std::uniform_int_distribution coinFlipDistribution(0, 1);
-
-                double output = distribution(mt);
-                if (coinFlipDistribution(mt))
-                    output = -output;
-
-                return output;
-            };
-
-        EmulatedFloat64TestOutput firstTestOutput = emulatedFloat64LoopedTests_impl(submitter, EmulatedFloat64TestIterations / 2, getRandomSmallFloat64, getRandomLargeFloat64);
-        EmulatedFloat64TestOutput secondTestOutput = emulatedFloat64LoopedTests_impl(submitter, EmulatedFloat64TestIterations / 2, getRandomLargeFloat64, getRandomSmallFloat64);
-
-        EmulatedFloat64TestOutput output;
-        output.cpuTestsSucceed = firstTestOutput.cpuTestsSucceed && secondTestOutput.cpuTestsSucceed;
-        output.gpuTestsSucceed = firstTestOutput.gpuTestsSucceed && secondTestOutput.gpuTestsSucceed;
-        return output;
-    }
-
-    EmulatedFloat64TestOutput emulatedFloat64BothValuesNaNTest(EF64Submitter& submitter)
-    {
-        smart_refctd_ptr<ISemaphore> semaphore = m_device->createSemaphore(0);
-
-        EmulatedFloat64TestValuesInfo<false, true> testValInfo;
-        const float32_t nan32 = std::numeric_limits<float32_t>::quiet_NaN();
-        const float64_t nan64 = std::numeric_limits<float64_t>::quiet_NaN();
-        testValInfo.a = emulated_float64_t<false, true>::create(nan64);
-        testValInfo.b = emulated_float64_t<false, true>::create(nan64);
-        testValInfo.constrTestValues = {
-            .int32 = std::bit_cast<int32_t>(nan32),
-            .int64 = std::bit_cast<int64_t>(nan64),
-            .uint32 = std::bit_cast<uint32_t>(nan32),
-            .uint64 = std::bit_cast<uint64_t>(nan64),
-            .float32 = nan32
-            //.float64 = nan64
-        };
-
-        testValInfo.fillExpectedTestValues();
-        return performEmulatedFloat64Tests(testValInfo, submitter);
-    }
-
-    EmulatedFloat64TestOutput emulatedFloat64NegAndPosZeroTest(EF64Submitter& submitter)
-    {
-        smart_refctd_ptr<ISemaphore> semaphore = m_device->createSemaphore(0);
-
-        EmulatedFloat64TestValuesInfo<false, true> testValInfo;
-        testValInfo.a = emulated_float64_t<false, true>::create(ieee754::traits<float64_t>::signMask);
-        testValInfo.b = emulated_float64_t<false, true>::create(std::bit_cast<uint64_t>(0.0));
-        testValInfo.constrTestValues = {
-            .int32 = 0,
-            .int64 = 0,
-            .uint32 = 0,
-            .uint64 = 0,
-            .float32 = 0
-        };
-
-        testValInfo.fillExpectedTestValues();
-        auto firstTestOutput = performEmulatedFloat64Tests(testValInfo, submitter);
-        std::swap(testValInfo.a, testValInfo.b);
-        testValInfo.fillExpectedTestValues();
-        auto secondTestOutput = performEmulatedFloat64Tests(testValInfo, submitter);
-
-        return { firstTestOutput.cpuTestsSucceed && secondTestOutput.cpuTestsSucceed, firstTestOutput.gpuTestsSucceed && secondTestOutput.gpuTestsSucceed };
-    }
-
-    EmulatedFloat64TestOutput emulatedFloat64BothValuesInfTest(EF64Submitter& submitter)
-    {
-        smart_refctd_ptr<ISemaphore> semaphore = m_device->createSemaphore(0);
-
-        EmulatedFloat64TestValuesInfo<false, true> testValInfo;
-        const float32_t inf32 = std::numeric_limits<float32_t>::infinity();
-        const float64_t inf64 = std::numeric_limits<float64_t>::infinity();
-        testValInfo.a = emulated_float64_t<false, true>::create(inf64);
-        testValInfo.b = emulated_float64_t<false, true>::create(inf64);
-        testValInfo.constrTestValues = {
-            .int32 = 0,
-            .int64 = 0,
-            .uint32 = 0,
-            .uint64 = 0,
-            .float32 = inf32
-            //.float64 = inf64
-        };
-
-        testValInfo.fillExpectedTestValues();
-        return performEmulatedFloat64Tests(testValInfo, submitter);
-    }
-
-    EmulatedFloat64TestOutput emulatedFloat64BothValuesNegInfTest(EF64Submitter& submitter)
-    {
-        smart_refctd_ptr<ISemaphore> semaphore = m_device->createSemaphore(0);
-
-        EmulatedFloat64TestValuesInfo<false, true> testValInfo;
-        const float32_t inf32 = -std::numeric_limits<float32_t>::infinity();
-        const float64_t inf64 = -std::numeric_limits<float64_t>::infinity();
-        testValInfo.a = emulated_float64_t<false, true>::create(inf64);
-        testValInfo.b = emulated_float64_t<false, true>::create(inf64);
-        testValInfo.constrTestValues = {
-            .int32 = 0,
-            .int64 = 0,
-            .uint32 = 0,
-            .uint64 = 0,
-            .float32 = inf32
-            //.float64 = inf64
-        };
-
-        testValInfo.fillExpectedTestValues();
-        return performEmulatedFloat64Tests(testValInfo, submitter);
-    }
-
-    EmulatedFloat64TestOutput emulatedFloat64OneValIsInfOtherIsNegInfTest(EF64Submitter& submitter)
-    {
-        smart_refctd_ptr<ISemaphore> semaphore = m_device->createSemaphore(0);
-
-        EmulatedFloat64TestValuesInfo<false, true> testValInfo;
-        const float64_t inf64 = -std::numeric_limits<float64_t>::infinity();
-        testValInfo.a = emulated_float64_t<false, true>::create(inf64);
-        testValInfo.b = emulated_float64_t<false, true>::create(inf64);
-        testValInfo.constrTestValues = {
-            .int32 = 0,
-            .int64 = 0,
-            .uint32 = 0,
-            .uint64 = 0,
-            .float32 = 0
-            //.float64 = inf64
-        };
-
-        testValInfo.fillExpectedTestValues();
-        auto firstTestOutput = performEmulatedFloat64Tests(testValInfo, submitter);
-        std::swap(testValInfo.a, testValInfo.b);
-        testValInfo.fillExpectedTestValues();
-        auto secondTestOutput = performEmulatedFloat64Tests(testValInfo, submitter);
-
-        return { firstTestOutput.cpuTestsSucceed && secondTestOutput.cpuTestsSucceed, firstTestOutput.gpuTestsSucceed && secondTestOutput.gpuTestsSucceed };
-    }
-
-    // TODO: fix
-    EmulatedFloat64TestOutput emulatedFloat64BNaNTest(EF64Submitter& submitter)
-    {
-        EmulatedFloat64TestOutput output = { true, true };
-        smart_refctd_ptr<ISemaphore> semaphore = m_device->createSemaphore(0);
-
-        for (uint32_t i = 0u; i < EmulatedFloat64TestIterations; ++i)
-        {
-            std::random_device rd;
-            std::mt19937 mt(rd());
-
-            std::uniform_int_distribution i32Distribution(-std::numeric_limits<int>::max(), std::numeric_limits<int>::max());
-            std::uniform_int_distribution i64Distribution(-std::numeric_limits<int64_t>::max(), std::numeric_limits<int64_t>::max());
-            std::uniform_int_distribution u32Distribution(-std::numeric_limits<uint32_t>::max(), std::numeric_limits<uint32_t>::max());
-            std::uniform_int_distribution u64Distribution(-std::numeric_limits<uint64_t>::max(), std::numeric_limits<uint64_t>::max());
-            std::uniform_real_distribution f32Distribution(-100000.0f, 100000.0f);
-            std::uniform_real_distribution f64Distribution(-100000.0, 100000.0);
-
-            EmulatedFloat64TestValuesInfo<false, true> testValInfo;
-            double aTmp = f64Distribution(mt);
-            double bTmp = std::numeric_limits<float64_t>::quiet_NaN();
-            testValInfo.a.data = reinterpret_cast<emulated_float64_t<false, true>::storage_t&>(aTmp);
-            testValInfo.b.data = reinterpret_cast<emulated_float64_t<false, true>::storage_t&>(bTmp);
-            testValInfo.constrTestValues.int32 = i32Distribution(mt);
-            testValInfo.constrTestValues.int64 = i64Distribution(mt);
-            testValInfo.constrTestValues.uint32 = u32Distribution(mt);
-            testValInfo.constrTestValues.uint64 = u64Distribution(mt);
-            testValInfo.constrTestValues.float32 = f32Distribution(mt);
-            //testValInfo.constrTestValues.float64 = f64Distribution(mt);
-
-            testValInfo.fillExpectedTestValues();
-            auto singleTestOutput = performEmulatedFloat64Tests(testValInfo, submitter);
-
-            if (!singleTestOutput.cpuTestsSucceed)
-                output.cpuTestsSucceed = false;
-            if (!singleTestOutput.gpuTestsSucceed)
-                output.gpuTestsSucceed = false;
-        }
-
-        return output;
-    }
-
-    EmulatedFloat64TestOutput emulatedFloat64OneValIsInfTest(EF64Submitter& submitter)
-    {
-        auto getRandomFloat64 = []()
-            {
-                static std::random_device rd;
-                static std::mt19937 mt(rd());
-                static std::uniform_real_distribution distribution(-100000.0, 100000.0);
-
-                return distribution(mt);
-            };
-
-        auto getInfinity = []()
-            {
-                return std::numeric_limits<float64_t>::infinity();
-            };
-
-        EmulatedFloat64TestOutput firstTestOutput = emulatedFloat64LoopedTests_impl(submitter, EmulatedFloat64TestIterations / 2, getRandomFloat64, getInfinity);
-        EmulatedFloat64TestOutput secondTestOutput = emulatedFloat64LoopedTests_impl(submitter, EmulatedFloat64TestIterations / 2, getInfinity, getRandomFloat64);
-
-        EmulatedFloat64TestOutput output;
-        output.cpuTestsSucceed = firstTestOutput.cpuTestsSucceed && secondTestOutput.cpuTestsSucceed;
-        output.gpuTestsSucceed = firstTestOutput.gpuTestsSucceed && secondTestOutput.gpuTestsSucceed;
-        return output;
-    }
+            if (!shader)
+               base.logFail("Failed to load precompiled \"test\" shader!\n");
+
+            nbl::video::IGPUDescriptorSetLayout::SBinding bindings[1] = {
+               {.binding       = 0,
+                  .type        = nbl::asset::IDescriptor::E_TYPE::ET_STORAGE_BUFFER,
+                  .createFlags = IGPUDescriptorSetLayout::SBinding::E_CREATE_FLAGS::ECF_NONE,
+                  .stageFlags  = ShaderStage::ESS_COMPUTE,
+                  .count       = 1}};
+            smart_refctd_ptr<IGPUDescriptorSetLayout> dsLayout = base.m_device->createDescriptorSetLayout(bindings);
+            if (!dsLayout)
+               base.logFail("Failed to create a Descriptor Layout!\n");
+
+            SPushConstantRange pushConstantRanges[] = {
+               {.stageFlags = ShaderStage::ESS_COMPUTE,
+                  .offset   = 0,
+                  .size     = sizeof(PushConstants)}};
+            m_pplnLayout = base.m_device->createPipelineLayout(pushConstantRanges, smart_refctd_ptr(dsLayout));
+            if (!m_pplnLayout)
+               base.logFail("Failed to create a Pipeline Layout!\n");
 
-    EmulatedFloat64TestOutput emulatedFloat64OneValIsNegInfTest(EF64Submitter& submitter)
-    {
-        auto getRandomFloat64 = []()
             {
-                static std::random_device rd;
-                static std::mt19937 mt(rd());
-                static std::uniform_real_distribution distribution(-100000.0, 100000.0);
-
-
-                return distribution(mt);
-            };
-
-        auto getNegInfinity = []()
-            {
-                return -std::numeric_limits<float64_t>::infinity();
-            };
-
-        EmulatedFloat64TestOutput firstTestOutput = emulatedFloat64LoopedTests_impl(submitter, EmulatedFloat64TestIterations / 2, getRandomFloat64, getNegInfinity);
-        EmulatedFloat64TestOutput secondTestOutput = emulatedFloat64LoopedTests_impl(submitter, EmulatedFloat64TestIterations / 2, getNegInfinity, getRandomFloat64);
-
-        EmulatedFloat64TestOutput output;
-        output.cpuTestsSucceed = firstTestOutput.cpuTestsSucceed && secondTestOutput.cpuTestsSucceed;
-        output.gpuTestsSucceed = firstTestOutput.gpuTestsSucceed && secondTestOutput.gpuTestsSucceed;
-        return output;
-    }
-
-    EmulatedFloat64TestOutput emulatedFloat64OneValIsZeroTest(EF64Submitter& submitter)
-    {
-        auto getRandomFloat64 = []()
-            {
-                static std::random_device rd;
-                static std::mt19937 mt(rd());
-                static std::uniform_real_distribution distribution(-100000.0, 100000.0);
-
-                return distribution(mt);
-            };
-
-        auto getZero = []()
-            {
-                return 0.0;
-            };
-
-        EmulatedFloat64TestOutput firstTestOutput = emulatedFloat64LoopedTests_impl(submitter, EmulatedFloat64TestIterations / 2, getRandomFloat64, getZero);
-        EmulatedFloat64TestOutput secondTestOutput = emulatedFloat64LoopedTests_impl(submitter, EmulatedFloat64TestIterations / 2, getZero, getRandomFloat64);
-
-        EmulatedFloat64TestOutput output; 
-        output.cpuTestsSucceed = firstTestOutput.cpuTestsSucceed && secondTestOutput.cpuTestsSucceed;
-        output.gpuTestsSucceed = firstTestOutput.gpuTestsSucceed && secondTestOutput.gpuTestsSucceed;
-        return output;
-    }
-
-    EmulatedFloat64TestOutput emulatedFloat64OneValIsNegZeroTest(EF64Submitter& submitter)
-    {
-        auto getRandomFloat64 = []()
-            {
-                static std::random_device rd;
-                static std::mt19937 mt(rd());
-                static std::uniform_real_distribution distribution(-100000.0, 100000.0);
-
-                return distribution(mt);
-            };
+               IGPUComputePipeline::SCreationParams params = {};
+               params.layout                               = m_pplnLayout.get();
+               params.shader.entryPoint                    = "main";
+               params.shader.shader                        = shader.get();
+               if (base.m_device->getEnabledFeatures().pipelineExecutableInfo)
+               {
+                  params.flags |= IGPUComputePipeline::SCreationParams::FLAGS::CAPTURE_STATISTICS;
+                  params.flags |= IGPUComputePipeline::SCreationParams::FLAGS::CAPTURE_INTERNAL_REPRESENTATIONS;
+               }
+               if (!base.m_device->createComputePipelines(nullptr, {&params, 1}, &m_pipeline))
+                  base.logFail("Failed to create pipelines (compile & link shaders)!\n");
+
+               if (base.m_device->getEnabledFeatures().pipelineExecutableInfo)
+               {
+                  auto report = system::to_string(m_pipeline->getExecutableInfo());
+                  base.m_logger->log("EF64Submitter Pipeline Executable Report:\n%s", ILogger::ELL_PERFORMANCE, report.c_str());
+               }
+            }
 
-        auto getNegZero = []()
-            {
-                return -0.0;
-            };
-
-        EmulatedFloat64TestOutput firstTestOutput = emulatedFloat64LoopedTests_impl(submitter, EmulatedFloat64TestIterations / 2, getRandomFloat64, getNegZero);
-        EmulatedFloat64TestOutput secondTestOutput = emulatedFloat64LoopedTests_impl(submitter, EmulatedFloat64TestIterations / 2, getNegZero, getRandomFloat64);
-
-        EmulatedFloat64TestOutput output;
-        output.cpuTestsSucceed = firstTestOutput.cpuTestsSucceed && secondTestOutput.cpuTestsSucceed;
-        output.gpuTestsSucceed = firstTestOutput.gpuTestsSucceed && secondTestOutput.gpuTestsSucceed;
-        return output;
-    }
-
-    template <bool FastMath, bool FlushDenormToZero>
-    EmulatedFloat64TestOutput performEmulatedFloat64Tests(EmulatedFloat64TestValuesInfo<FastMath, FlushDenormToZero>& testValInfo, EF64Submitter& submitter)
-    {
-        emulated_float64_t<false, true> a = testValInfo.a;
-        emulated_float64_t<false, true> b = testValInfo.b;
-
-        const TestValues<FastMath, FlushDenormToZero> cpuTestValues = {
-            .int32CreateVal = emulated_float64_t<FastMath, FlushDenormToZero>::create(testValInfo.constrTestValues.int32).data,
-            .int64CreateVal = emulated_float64_t<FastMath, FlushDenormToZero>::create(testValInfo.constrTestValues.int64).data,
-            .uint32CreateVal = emulated_float64_t<FastMath, FlushDenormToZero>::create(testValInfo.constrTestValues.uint32).data,
-            .uint64CreateVal = emulated_float64_t<FastMath, FlushDenormToZero>::create(testValInfo.constrTestValues.uint64).data,
-            .float32CreateVal = emulated_float64_t<FastMath, FlushDenormToZero>::create(testValInfo.constrTestValues.float32).data,
-            .float64CreateVal = emulated_float64_t<FastMath, FlushDenormToZero>::create(testValInfo.constrTestValues.float64).data,
-            .additionVal = (a + b).data,
-            .substractionVal = (a - b).data,
-            .multiplicationVal = (a * b).data,
-            .divisionVal = (a / b).data,
-            .lessOrEqualVal = a <= b,
-            .greaterOrEqualVal = a >= b,
-            .equalVal = a == b,
-            .notEqualVal = a != b,
-            .lessVal = a < b,
-            .greaterVal = a > b
-        };
-
-        EmulatedFloat64TestOutput output;
-
-        // cpu validation
-        output.cpuTestsSucceed = compareEmulatedFloat64TestValues<false, true, EmulatedFloatTestDevice::CPU>(testValInfo.expectedTestValues, cpuTestValues);
-
-        // gpu validation
-        PushConstants pc;
-        pc.a = reinterpret_cast<uint64_t&>(a);
-        pc.b = reinterpret_cast<uint64_t&>(b);
-        pc.constrTestVals = testValInfo.constrTestValues;
-        
-        submitter.setPushConstants(pc);
-        auto gpuTestValues = submitter.submitGetGPUTestValues();
-
-        output.gpuTestsSucceed = compareEmulatedFloat64TestValues<false, true, EmulatedFloatTestDevice::GPU>(testValInfo.expectedTestValues, gpuTestValues);
-
-        return output;
-    }
-
-    class EF64Benchmark final
-    {
-    public:
-        EF64Benchmark(CompatibilityTest& base)
-        {
-            m_device = base.m_device;
-            m_logger = base.m_logger;
-            m_api = base.m_api;
-
-            // setting up pipeline in the constructor
-            m_queueFamily = base.getComputeQueue()->getFamilyIndex();
-            m_cmdpool = base.m_device->createCommandPool(m_queueFamily, IGPUCommandPool::CREATE_FLAGS::RESET_COMMAND_BUFFER_BIT);
-            if (!m_cmdpool->createCommandBuffers(IGPUCommandPool::BUFFER_LEVEL::PRIMARY, 1u, &m_cmdbuf))
-                base.logFail("Failed to create Command Buffers!\n");
-
-            // Load shaders, set up pipeline
+            // Allocate the memory
             {
-                smart_refctd_ptr<IShader> shader;
-                {
-                    IAssetLoader::SAssetLoadParams lp = {};
-                    lp.logger = base.m_logger.get();
-                    lp.workingDirectory = "app_resources"; // virtual root
-                    // this time we load a shader directly from a file
-                    auto key = nbl::this_example::builtin::build::get_spirv_key<"benchmark">(m_device.get());
-                    auto assetBundle = base.m_assetMgr->getAsset(key.data(), lp);
-                    const auto assets = assetBundle.getContents();
-                    if (assets.empty())
-                    {
-                        base.logFail("Could not load shader!");
-                        assert(0);
-                    }
-
-                    // It would be super weird if loading a shader from a file produced more than 1 asset
-                    assert(assets.size() == 1);
-                    shader = IAsset::castDown<IShader>(assets[0]);
-                }
-
-                if (!shader)
-                    base.logFail("Failed to load precompiled \"benchmark\" shader!\n");
-
-                nbl::video::IGPUDescriptorSetLayout::SBinding bindings[1] = {
-                    {
-                        .binding = 0,
-                        .type = nbl::asset::IDescriptor::E_TYPE::ET_STORAGE_BUFFER,
-                        .createFlags = IGPUDescriptorSetLayout::SBinding::E_CREATE_FLAGS::ECF_NONE,
-                        .stageFlags = ShaderStage::ESS_COMPUTE,
-                        .count = 1
-                    }
-                };
-                smart_refctd_ptr<IGPUDescriptorSetLayout> dsLayout = base.m_device->createDescriptorSetLayout(bindings);
-                if (!dsLayout)
-                    base.logFail("Failed to create a Descriptor Layout!\n");
-
-                SPushConstantRange pushConstantRanges[] = {
-                    {
-                        .stageFlags = ShaderStage::ESS_COMPUTE,
-                        .offset = 0,
-                        .size = sizeof(BenchmarkPushConstants)
-                    }
-                };
-                m_pplnLayout = base.m_device->createPipelineLayout(pushConstantRanges, smart_refctd_ptr(dsLayout));
-                if (!m_pplnLayout)
-                    base.logFail("Failed to create a Pipeline Layout!\n");
-
-                {
-                    IGPUComputePipeline::SCreationParams params = {};
-                    params.layout = m_pplnLayout.get();
-                    params.shader.entryPoint = "main";
-                    params.shader.shader = shader.get();
-                    if (base.m_device->getEnabledFeatures().pipelineExecutableInfo)
-                    {
-                        params.flags |= IGPUComputePipeline::SCreationParams::FLAGS::CAPTURE_STATISTICS;
-                        params.flags |= IGPUComputePipeline::SCreationParams::FLAGS::CAPTURE_INTERNAL_REPRESENTATIONS;
-                    }
-                    if (!base.m_device->createComputePipelines(nullptr, { &params,1 }, &m_pipeline))
-                        base.logFail("Failed to create pipelines (compile & link shaders)!\n");
-
-                    if (base.m_device->getEnabledFeatures().pipelineExecutableInfo)
-                    {
-                        auto report = system::to_string(m_pipeline->getExecutableInfo());
-                        base.m_logger->log("EF64Benchmark Pipeline Executable Report:\n%s", ILogger::ELL_PERFORMANCE, report.c_str());
-                    }
-                }
-
-                // Allocate the memory
-                {
-                    static_assert(sizeof(float64_t) == sizeof(benchmark_emulated_float64_t));
-                    constexpr size_t BufferSize = BENCHMARK_WORKGROUP_COUNT * BENCHMARK_WORKGROUP_DIMENSION_SIZE_X *
-                        BENCHMARK_WORKGROUP_DIMENSION_SIZE_Y * BENCHMARK_WORKGROUP_DIMENSION_SIZE_Z * sizeof(float64_t);
-
-                    nbl::video::IGPUBuffer::SCreationParams params = {};
-                    params.size = BufferSize;
-                    params.usage = IGPUBuffer::EUF_STORAGE_BUFFER_BIT;
-                    smart_refctd_ptr<IGPUBuffer> dummyBuff = base.m_device->createBuffer(std::move(params));
-                    if (!dummyBuff)
-                        base.logFail("Failed to create a GPU Buffer of size %d!\n", params.size);
-
-                    dummyBuff->setObjectDebugName("benchmark buffer");
-
-                    nbl::video::IDeviceMemoryBacked::SDeviceMemoryRequirements reqs = dummyBuff->getMemoryReqs();
-                    reqs.memoryTypeBits &= base.m_physicalDevice->getDeviceLocalMemoryTypeBits();
-
-                    m_allocation = base.m_device->allocate(reqs, dummyBuff.get(), nbl::video::IDeviceMemoryAllocation::EMAF_NONE);
-                    if (!m_allocation.isValid())
-                        base.logFail("Failed to allocate Device Memory compatible with our GPU Buffer!\n");
-
-                    assert(dummyBuff->getBoundMemory().memory == m_allocation.memory.get());
-                    smart_refctd_ptr<nbl::video::IDescriptorPool> pool = base.m_device->createDescriptorPoolForDSLayouts(IDescriptorPool::ECF_NONE, { &dsLayout.get(),1 });
-
-                    m_ds = pool->createDescriptorSet(std::move(dsLayout));
-                    {
-                        IGPUDescriptorSet::SDescriptorInfo info[1];
-                        info[0].desc = smart_refctd_ptr(dummyBuff);
-                        info[0].info.buffer = { .offset = 0,.size = BufferSize };
-                        IGPUDescriptorSet::SWriteDescriptorSet writes[1] = {
-                            {.dstSet = m_ds.get(),.binding = 0,.arrayElement = 0,.count = 1,.info = info}
-                        };
-                        base.m_device->updateDescriptorSets(writes, {});
-                    }
-                }
+               constexpr size_t BufferSize = sizeof(TestValues<false, true>);
+
+               nbl::video::IGPUBuffer::SCreationParams params = {};
+               params.size                                    = BufferSize;
+               params.usage                                   = IGPUBuffer::EUF_STORAGE_BUFFER_BIT;
+               smart_refctd_ptr<IGPUBuffer> outputBuff        = base.m_device->createBuffer(std::move(params));
+               if (!outputBuff)
+                  base.logFail("Failed to create a GPU Buffer of size %d!\n", params.size);
+
+               outputBuff->setObjectDebugName("emulated_float64_t output buffer");
+
+               nbl::video::IDeviceMemoryBacked::SDeviceMemoryRequirements reqs = outputBuff->getMemoryReqs();
+               reqs.memoryTypeBits &= base.m_physicalDevice->getHostVisibleMemoryTypeBits();
+
+               m_allocation = base.m_device->allocate(reqs, outputBuff.get(), nbl::video::IDeviceMemoryAllocation::EMAF_NONE);
+               if (!m_allocation.isValid())
+                  base.logFail("Failed to allocate Device Memory compatible with our GPU Buffer!\n");
+
+               assert(outputBuff->getBoundMemory().memory == m_allocation.memory.get());
+               smart_refctd_ptr<nbl::video::IDescriptorPool> pool = base.m_device->createDescriptorPoolForDSLayouts(IDescriptorPool::ECF_NONE, {&dsLayout.get(), 1});
+
+               m_ds = pool->createDescriptorSet(std::move(dsLayout));
+               {
+                  IGPUDescriptorSet::SDescriptorInfo info[1];
+                  info[0].desc                                     = smart_refctd_ptr(outputBuff);
+                  info[0].info.buffer                              = {.offset = 0, .size = BufferSize};
+                  IGPUDescriptorSet::SWriteDescriptorSet writes[1] = {
+                     {.dstSet = m_ds.get(), .binding = 0, .arrayElement = 0, .count = 1, .info = info}};
+                  base.m_device->updateDescriptorSets(writes, {});
+               }
             }
 
-            IQueryPool::SCreationParams queryPoolCreationParams{};
-            queryPoolCreationParams.queryType = IQueryPool::TYPE::TIMESTAMP;
-            queryPoolCreationParams.queryCount = 2;
-            queryPoolCreationParams.pipelineStatisticsFlags = IQueryPool::PIPELINE_STATISTICS_FLAGS::NONE;
-            m_queryPool = m_device->createQueryPool(queryPoolCreationParams);
-
-            m_computeQueue = m_device->getQueue(m_queueFamily, 0);
-        }
-
-        void run()
-        {
-            m_logger->log("\n\nfloat64_t benchmark result:", ILogger::ELL_PERFORMANCE);
-            performBenchmark(EF64_BENCHMARK_MODE::NATIVE);
-            m_logger->log("emulated_float64_t benchmark, fast math enabled result:", ILogger::ELL_PERFORMANCE);
-            performBenchmark(EF64_BENCHMARK_MODE::EF64_FAST_MATH_ENABLED);
-            m_logger->log("emulated_float64_t benchmark, fast math disabled result:", ILogger::ELL_PERFORMANCE);
-            performBenchmark(EF64_BENCHMARK_MODE::EF64_FAST_MATH_DISABLED);
-            // every subgroup with even ID do calculations with the `emulated_float64_t<false, true>` type, other subgroups do calculations with float64_t
-            m_logger->log("emulated_float64_t benchmark, subgroup divided work result:", ILogger::ELL_PERFORMANCE);
-            performBenchmark(EF64_BENCHMARK_MODE::SUBGROUP_DIVIDED_WORK);
-            // every item does calculations with both emulated and native types
-            m_logger->log("emulated_float64_t benchmark, interleaved result:", ILogger::ELL_PERFORMANCE);
-            performBenchmark(EF64_BENCHMARK_MODE::INTERLEAVED);
-        }
-
-    private:
-        void performBenchmark(EF64_BENCHMARK_MODE mode)
-        {
-            m_device->waitIdle();
-
-            m_pushConstants.benchmarkMode = mode;
-
-            // [warmup dispatches][ts 0][bench dispatches][ts 1][cooldown dispatches] in one cmdbuf,
-            // one submit. Per-submit semaphore chaining adds sync cost and blocks driver pipelining;
-            // the cooldown keeps the GPU in steady state across ts 1 so the trailing bench
-            // dispatches don't land in a winding-down tail.
-            constexpr int CooldownIterations = WarmupIterations;
-
-            m_cmdbuf->reset(IGPUCommandBuffer::RESET_FLAGS::NONE);
-            m_cmdbuf->begin(IGPUCommandBuffer::USAGE::ONE_TIME_SUBMIT_BIT);
-            m_cmdbuf->beginDebugMarker("emulated_float64_t compute dispatch", vectorSIMDf(0, 1, 0, 1));
-            m_cmdbuf->resetQueryPool(m_queryPool.get(), 0, 2);
-            m_cmdbuf->bindComputePipeline(m_pipeline.get());
-            m_cmdbuf->bindDescriptorSets(nbl::asset::EPBP_COMPUTE, m_pplnLayout.get(), 0, 1, &m_ds.get());
-            m_cmdbuf->pushConstants(m_pplnLayout.get(), IShader::E_SHADER_STAGE::ESS_COMPUTE, 0, sizeof(BenchmarkPushConstants), &m_pushConstants);
-            for (int i = 0; i < WarmupIterations; ++i)
-                m_cmdbuf->dispatch(BENCHMARK_WORKGROUP_COUNT, 1, 1);
-            m_cmdbuf->writeTimestamp(PIPELINE_STAGE_FLAGS::COMPUTE_SHADER_BIT, m_queryPool.get(), 0);
-            for (int i = 0; i < Iterations; ++i)
-                m_cmdbuf->dispatch(BENCHMARK_WORKGROUP_COUNT, 1, 1);
-            m_cmdbuf->writeTimestamp(PIPELINE_STAGE_FLAGS::COMPUTE_SHADER_BIT, m_queryPool.get(), 1);
-            for (int i = 0; i < CooldownIterations; ++i)
-                m_cmdbuf->dispatch(BENCHMARK_WORKGROUP_COUNT, 1, 1);
-            m_cmdbuf->endDebugMarker();
-            m_cmdbuf->end();
-
-            smart_refctd_ptr<ISemaphore> semaphore = m_device->createSemaphore(0u);
-            const IQueue::SSubmitInfo::SCommandBufferInfo cmdbufs[] = { {.cmdbuf = m_cmdbuf.get()} };
-            const IQueue::SSubmitInfo::SSemaphoreInfo signalSem[] = {
-                {.semaphore = semaphore.get(), .value = 1u, .stageMask = asset::PIPELINE_STAGE_FLAGS::COMPUTE_SHADER_BIT}
-            };
-            IQueue::SSubmitInfo submit = {};
-            submit.commandBuffers = cmdbufs;
-            submit.signalSemaphores = signalSem;
-
-            m_api->startCapture();
-            m_computeQueue->submit({&submit, 1u});
-            m_api->endCapture();
-
-            m_device->waitIdle();
-
-            const uint64_t nativeBenchmarkTimeElapsedNanoseconds = calcTimeElapsed();
-            const float nativeBenchmarkTimeElapsedSeconds = double(nativeBenchmarkTimeElapsedNanoseconds) / 1000000000.0;
-
-            m_logger->log("%llu ns, %f s", ILogger::ELL_PERFORMANCE, nativeBenchmarkTimeElapsedNanoseconds, nativeBenchmarkTimeElapsedSeconds);
-        }
-
-        uint64_t calcTimeElapsed()
-        {
-            uint64_t timestamps[2];
-            const core::bitflag flags = core::bitflag(IQueryPool::RESULTS_FLAGS::_64_BIT) | core::bitflag(IQueryPool::RESULTS_FLAGS::WAIT_BIT);
-            m_device->getQueryPoolResults(m_queryPool.get(), 0, 2, &timestamps, sizeof(uint64_t), flags);
-            return timestamps[1] - timestamps[0];
-        }
-
-    private:
-        core::smart_refctd_ptr<video::CVulkanConnection> m_api;
-        smart_refctd_ptr<ILogicalDevice> m_device;
-        smart_refctd_ptr<ILogger> m_logger;
-
-        nbl::video::IDeviceMemoryAllocator::SAllocation m_allocation = {};
-        smart_refctd_ptr<nbl::video::IGPUCommandPool> m_cmdpool = nullptr;
-        smart_refctd_ptr<nbl::video::IGPUCommandBuffer> m_cmdbuf = nullptr;
-        smart_refctd_ptr<nbl::video::IGPUDescriptorSet> m_ds = nullptr;
-        smart_refctd_ptr<nbl::video::IGPUPipelineLayout> m_pplnLayout = nullptr;
-        BenchmarkPushConstants m_pushConstants;
-        smart_refctd_ptr<nbl::video::IGPUComputePipeline> m_pipeline;
-
-        smart_refctd_ptr<nbl::video::IQueryPool> m_queryPool = nullptr;
-
-        uint32_t m_queueFamily;
-        IQueue* m_computeQueue;
-        static constexpr int WarmupIterations = 1000;
-        static constexpr int Iterations = 1000;
-        using benchmark_emulated_float64_t = emulated_float64_t<false, true>;
-    };
-
-    template<typename... Args>
-    inline bool logFail(const char* msg, Args&&... args)
-    {
-        m_logger->log(msg, ILogger::ELL_ERROR, std::forward<Args>(args)...);
-        return false;
-    }
-
-    std::ofstream m_logFile;
+            if (!m_allocation.memory->map({0ull, m_allocation.memory->getAllocationSize()}, IDeviceMemoryAllocation::EMCAF_READ))
+               base.logFail("Failed to map the Device Memory!\n");
+         }
+
+         // if the mapping is not coherent the range needs to be invalidated to pull in new data for the CPU's caches
+         const ILogicalDevice::MappedMemoryRange memoryRange(m_allocation.memory.get(), 0ull, m_allocation.memory->getAllocationSize());
+         if (!m_allocation.memory->getMemoryPropertyFlags().hasFlags(IDeviceMemoryAllocation::EMPF_HOST_COHERENT_BIT))
+            base.m_device->invalidateMappedMemoryRanges(1, &memoryRange);
+
+         assert(memoryRange.valid() && memoryRange.length >= sizeof(TestValues<false, true>));
+
+         m_queue = m_base.m_device->getQueue(m_queueFamily, 0);
+      }
+
+      ~EF64Submitter()
+      {
+         m_allocation.memory->unmap();
+      }
+
+      void setPushConstants(PushConstants& pc)
+      {
+         m_pushConstants = pc;
+      }
+
+      TestValues<false, true> submitGetGPUTestValues()
+      {
+         // record command buffer
+         m_cmdbuf->reset(IGPUCommandBuffer::RESET_FLAGS::NONE);
+         m_cmdbuf->begin(IGPUCommandBuffer::USAGE::NONE);
+         m_cmdbuf->beginDebugMarker("emulated_float64_t compute dispatch", vectorSIMDf(0, 1, 0, 1));
+         m_cmdbuf->bindComputePipeline(m_pipeline.get());
+         m_cmdbuf->bindDescriptorSets(nbl::asset::EPBP_COMPUTE, m_pplnLayout.get(), 0, 1, &m_ds.get());
+         m_cmdbuf->pushConstants(m_pplnLayout.get(), IShader::E_SHADER_STAGE::ESS_COMPUTE, 0, sizeof(PushConstants), &m_pushConstants);
+         m_cmdbuf->dispatch(WORKGROUP_SIZE, 1, 1);
+         m_cmdbuf->endDebugMarker();
+         m_cmdbuf->end();
+
+         IQueue::SSubmitInfo                           submitInfos[1] = {};
+         const IQueue::SSubmitInfo::SCommandBufferInfo cmdbufs[]      = {{.cmdbuf = m_cmdbuf.get()}};
+         submitInfos[0].commandBuffers                                = cmdbufs;
+         const IQueue::SSubmitInfo::SSemaphoreInfo signals[]          = {{.semaphore = m_semaphore.get(), .value = ++m_semaphoreCounter, .stageMask = asset::PIPELINE_STAGE_FLAGS::COMPUTE_SHADER_BIT}};
+         submitInfos[0].signalSemaphores                              = signals;
+
+         m_base.m_api->startCapture();
+         m_queue->submit(submitInfos);
+         m_base.m_api->endCapture();
+
+         m_base.m_device->waitIdle();
+         TestValues<false, true> output;
+         std::memcpy(&output, static_cast<TestValues<false, true>*>(m_allocation.memory->getMappedPointer()), sizeof(TestValues<false, true>));
+         m_base.m_device->waitIdle();
+
+         return output;
+      }
+
+  private:
+      uint32_t                                          m_queueFamily;
+      nbl::video::IDeviceMemoryAllocator::SAllocation   m_allocation = {};
+      smart_refctd_ptr<nbl::video::IGPUCommandBuffer>   m_cmdbuf     = nullptr;
+      smart_refctd_ptr<nbl::video::IGPUCommandPool>     m_cmdpool    = nullptr;
+      smart_refctd_ptr<nbl::video::IGPUDescriptorSet>   m_ds         = nullptr;
+      smart_refctd_ptr<nbl::video::IGPUPipelineLayout>  m_pplnLayout = nullptr;
+      PushConstants                                     m_pushConstants;
+      CompatibilityTest&                                m_base;
+      smart_refctd_ptr<nbl::video::IGPUComputePipeline> m_pipeline;
+      smart_refctd_ptr<ISemaphore>                      m_semaphore;
+      IQueue*                                           m_queue;
+      uint64_t                                          m_semaphoreCounter;
+   };
+
+   void emulated_float64_tests()
+   {
+      EF64Submitter submitter(*this);
+
+      auto printTestOutput = [this](const std::string& functionName, const EmulatedFloat64TestOutput& testResult)
+      {
+         std::cout << functionName << ": " << std::endl;
+
+         if (!testResult.cpuTestsSucceed)
+            logFail("Incorrect CPU determinated values!");
+         else
+            m_logger->log("Correct CPU determinated values!", ILogger::ELL_PERFORMANCE);
+
+         if (!testResult.gpuTestsSucceed)
+            logFail("Incorrect GPU determinated values!");
+         else
+            m_logger->log("Correct GPU determinated values!", ILogger::ELL_PERFORMANCE);
+      };
+
+      m_logFile.open("EmulatedFloatTestLog.txt", std::ios::out | std::ios::trunc);
+      if (!m_logFile.is_open())
+         m_logger->log("Failed to open log file!", system::ILogger::ELL_ERROR);
+
+      printTestOutput("emulatedFloat64RandomValuesTest", emulatedFloat64RandomValuesTest(submitter));
+      printTestOutput("emulatedFloat64RandomValuesTestContrastingExponents", emulatedFloat64RandomValuesTestContrastingExponents(submitter));
+      printTestOutput("emulatedFloat64NegAndPosZeroTest", emulatedFloat64NegAndPosZeroTest(submitter));
+      printTestOutput("emulatedFloat64BothValuesInfTest", emulatedFloat64BothValuesInfTest(submitter));
+      printTestOutput("emulatedFloat64BothValuesNegInfTest", emulatedFloat64BothValuesNegInfTest(submitter));
+      printTestOutput("emulatedFloat64OneValIsInfOtherIsNegInfTest", emulatedFloat64OneValIsInfOtherIsNegInfTest(submitter));
+      printTestOutput("emulatedFloat64OneValIsInfTest", emulatedFloat64OneValIsInfTest(submitter));
+      printTestOutput("emulatedFloat64OneValIsNegInfTest", emulatedFloat64OneValIsNegInfTest(submitter));
+      if (false) // doesn't work for some reason + fast math is enabled by default
+         printTestOutput("emulatedFloat64BNaNTest", emulatedFloat64BNaNTest(submitter));
+      printTestOutput("emulatedFloat64BInfTest", emulatedFloat64OneValIsZeroTest(submitter));
+      printTestOutput("emulatedFloat64BNegInfTest", emulatedFloat64OneValIsNegZeroTest(submitter));
+
+      m_logFile.close();
+   }
+
+   template<bool FastMath, bool FlushDenormToZero>
+   struct EmulatedFloat64TestValuesInfo
+   {
+      emulated_float64_t<FastMath, FlushDenormToZero> a;
+      emulated_float64_t<FastMath, FlushDenormToZero> b;
+      ConstructorTestValues                           constrTestValues;
+      TestValues<FastMath, FlushDenormToZero>         expectedTestValues;
+
+      void fillExpectedTestValues()
+      {
+         double aAsDouble = reinterpret_cast<double&>(a);
+         double bAsDouble = reinterpret_cast<double&>(b);
+
+         expectedTestValues.a = a.data;
+         expectedTestValues.b = b.data;
+
+         expectedTestValues.int32CreateVal    = bit_cast<uint64_t>(double(constrTestValues.int32));
+         expectedTestValues.int64CreateVal    = bit_cast<uint64_t>(double(constrTestValues.int64));
+         expectedTestValues.uint32CreateVal   = bit_cast<uint64_t>(double(constrTestValues.uint32));
+         expectedTestValues.uint64CreateVal   = bit_cast<uint64_t>(double(constrTestValues.uint64));
+         expectedTestValues.float32CreateVal  = bit_cast<uint64_t>(double(constrTestValues.float32));
+         expectedTestValues.float64CreateVal  = bit_cast<uint64_t>(constrTestValues.float64);
+         expectedTestValues.additionVal       = emulated_float64_t<FastMath, FlushDenormToZero>::create(aAsDouble + bAsDouble).data;
+         expectedTestValues.substractionVal   = emulated_float64_t<FastMath, FlushDenormToZero>::create(aAsDouble - bAsDouble).data;
+         expectedTestValues.multiplicationVal = emulated_float64_t<FastMath, FlushDenormToZero>::create(aAsDouble * bAsDouble).data;
+         expectedTestValues.divisionVal       = emulated_float64_t<FastMath, FlushDenormToZero>::create(aAsDouble / bAsDouble).data;
+         expectedTestValues.lessOrEqualVal    = aAsDouble <= bAsDouble;
+         expectedTestValues.greaterOrEqualVal = aAsDouble >= bAsDouble;
+         expectedTestValues.equalVal          = aAsDouble == bAsDouble;
+         expectedTestValues.notEqualVal       = aAsDouble != bAsDouble;
+         expectedTestValues.lessVal           = aAsDouble < bAsDouble;
+         expectedTestValues.greaterVal        = aAsDouble > bAsDouble;
+      }
+   };
+
+   struct EmulatedFloat64TestOutput
+   {
+      bool cpuTestsSucceed;
+      bool gpuTestsSucceed;
+   };
+
+   EmulatedFloat64TestOutput emulatedFloat64LoopedTests_impl(EF64Submitter& submitter,
+      const uint32_t                                                        iterations,
+      const std::function<double()>&                                        determineValueA,
+      const std::function<double()>&                                        determineValueB)
+   {
+      EmulatedFloat64TestOutput output = {true, true};
+
+      std::uniform_int_distribution  i32Distribution(-std::numeric_limits<int>::max(), std::numeric_limits<int>::max());
+      std::uniform_int_distribution  i64Distribution(-std::numeric_limits<int64_t>::max(), std::numeric_limits<int64_t>::max());
+      std::uniform_int_distribution  u32Distribution(-std::numeric_limits<uint32_t>::max(), std::numeric_limits<uint32_t>::max());
+      std::uniform_int_distribution  u64Distribution(-std::numeric_limits<uint64_t>::max(), std::numeric_limits<uint64_t>::max());
+      std::uniform_real_distribution fDistribution(-100000.0, 100000.0);
+
+      std::random_device rd;
+      std::mt19937       mt(rd());
+
+      for (uint32_t i = 0u; i < iterations; ++i)
+      {
+         // generate random test values
+         EmulatedFloat64TestValuesInfo<false, true> testValInfo;
+         double                                     aTmp = determineValueA();
+         double                                     bTmp = determineValueB();
+         testValInfo.a.data                              = reinterpret_cast<emulated_float64_t<false, true>::storage_t&>(aTmp);
+         testValInfo.b.data                              = reinterpret_cast<emulated_float64_t<false, true>::storage_t&>(bTmp);
+         testValInfo.constrTestValues.int32              = i32Distribution(mt);
+         testValInfo.constrTestValues.int64              = i64Distribution(mt);
+         testValInfo.constrTestValues.uint32             = u32Distribution(mt);
+         testValInfo.constrTestValues.uint64             = u64Distribution(mt);
+         testValInfo.constrTestValues.float32            = fDistribution(mt);
+         testValInfo.constrTestValues.float64            = fDistribution(mt);
+
+         testValInfo.fillExpectedTestValues();
+         auto singleTestOutput = performEmulatedFloat64Tests(testValInfo, submitter);
+
+         if (!singleTestOutput.cpuTestsSucceed)
+            output.cpuTestsSucceed = false;
+         if (!singleTestOutput.gpuTestsSucceed)
+            output.gpuTestsSucceed = false;
+      }
+
+      return output;
+   }
+
+   EmulatedFloat64TestOutput emulatedFloat64RandomValuesTest(EF64Submitter& submitter)
+   {
+      auto getRandomFloat64 = []()
+      {
+         static std::random_device             rd;
+         static std::mt19937                   mt(rd());
+         static std::uniform_real_distribution distribution(-100000.0, 100000.0);
+
+
+         return distribution(mt);
+      };
+
+      return emulatedFloat64LoopedTests_impl(submitter, EmulatedFloat64TestIterations, getRandomFloat64, getRandomFloat64);
+   }
+
+   EmulatedFloat64TestOutput emulatedFloat64RandomValuesTestContrastingExponents(EF64Submitter& submitter)
+   {
+      auto getRandomSmallFloat64 = []()
+      {
+         static std::random_device             rd;
+         static std::mt19937                   mt(rd());
+         static std::uniform_real_distribution distribution(-0.01, 0.01);
+
+         return distribution(mt);
+      };
+
+      auto getRandomLargeFloat64 = []()
+      {
+         static std::random_device             rd;
+         static std::mt19937                   mt(rd());
+         static std::uniform_real_distribution distribution(1000000000.0, 2000000000.0);
+         static std::uniform_int_distribution  coinFlipDistribution(0, 1);
+
+         double output = distribution(mt);
+         if (coinFlipDistribution(mt))
+            output = -output;
+
+         return output;
+      };
+
+      EmulatedFloat64TestOutput firstTestOutput  = emulatedFloat64LoopedTests_impl(submitter, EmulatedFloat64TestIterations / 2, getRandomSmallFloat64, getRandomLargeFloat64);
+      EmulatedFloat64TestOutput secondTestOutput = emulatedFloat64LoopedTests_impl(submitter, EmulatedFloat64TestIterations / 2, getRandomLargeFloat64, getRandomSmallFloat64);
+
+      EmulatedFloat64TestOutput output;
+      output.cpuTestsSucceed = firstTestOutput.cpuTestsSucceed && secondTestOutput.cpuTestsSucceed;
+      output.gpuTestsSucceed = firstTestOutput.gpuTestsSucceed && secondTestOutput.gpuTestsSucceed;
+      return output;
+   }
+
+   EmulatedFloat64TestOutput emulatedFloat64BothValuesNaNTest(EF64Submitter& submitter)
+   {
+      smart_refctd_ptr<ISemaphore> semaphore = m_device->createSemaphore(0);
+
+      EmulatedFloat64TestValuesInfo<false, true> testValInfo;
+      const float32_t                            nan32 = std::numeric_limits<float32_t>::quiet_NaN();
+      const float64_t                            nan64 = std::numeric_limits<float64_t>::quiet_NaN();
+      testValInfo.a                                    = emulated_float64_t<false, true>::create(nan64);
+      testValInfo.b                                    = emulated_float64_t<false, true>::create(nan64);
+      testValInfo.constrTestValues                     = {
+                             .int32   = std::bit_cast<int32_t>(nan32),
+                             .int64   = std::bit_cast<int64_t>(nan64),
+                             .uint32  = std::bit_cast<uint32_t>(nan32),
+                             .uint64  = std::bit_cast<uint64_t>(nan64),
+                             .float32 = nan32
+         //.float64 = nan64
+      };
+
+      testValInfo.fillExpectedTestValues();
+      return performEmulatedFloat64Tests(testValInfo, submitter);
+   }
+
+   EmulatedFloat64TestOutput emulatedFloat64NegAndPosZeroTest(EF64Submitter& submitter)
+   {
+      smart_refctd_ptr<ISemaphore> semaphore = m_device->createSemaphore(0);
+
+      EmulatedFloat64TestValuesInfo<false, true> testValInfo;
+      testValInfo.a                = emulated_float64_t<false, true>::create(ieee754::traits<float64_t>::signMask);
+      testValInfo.b                = emulated_float64_t<false, true>::create(std::bit_cast<uint64_t>(0.0));
+      testValInfo.constrTestValues = {
+         .int32   = 0,
+         .int64   = 0,
+         .uint32  = 0,
+         .uint64  = 0,
+         .float32 = 0};
+
+      testValInfo.fillExpectedTestValues();
+      auto firstTestOutput = performEmulatedFloat64Tests(testValInfo, submitter);
+      std::swap(testValInfo.a, testValInfo.b);
+      testValInfo.fillExpectedTestValues();
+      auto secondTestOutput = performEmulatedFloat64Tests(testValInfo, submitter);
+
+      return {firstTestOutput.cpuTestsSucceed && secondTestOutput.cpuTestsSucceed, firstTestOutput.gpuTestsSucceed && secondTestOutput.gpuTestsSucceed};
+   }
+
+   EmulatedFloat64TestOutput emulatedFloat64BothValuesInfTest(EF64Submitter& submitter)
+   {
+      smart_refctd_ptr<ISemaphore> semaphore = m_device->createSemaphore(0);
+
+      EmulatedFloat64TestValuesInfo<false, true> testValInfo;
+      const float32_t                            inf32 = std::numeric_limits<float32_t>::infinity();
+      const float64_t                            inf64 = std::numeric_limits<float64_t>::infinity();
+      testValInfo.a                                    = emulated_float64_t<false, true>::create(inf64);
+      testValInfo.b                                    = emulated_float64_t<false, true>::create(inf64);
+      testValInfo.constrTestValues                     = {
+                             .int32   = 0,
+                             .int64   = 0,
+                             .uint32  = 0,
+                             .uint64  = 0,
+                             .float32 = inf32
+         //.float64 = inf64
+      };
+
+      testValInfo.fillExpectedTestValues();
+      return performEmulatedFloat64Tests(testValInfo, submitter);
+   }
+
+   EmulatedFloat64TestOutput emulatedFloat64BothValuesNegInfTest(EF64Submitter& submitter)
+   {
+      smart_refctd_ptr<ISemaphore> semaphore = m_device->createSemaphore(0);
+
+      EmulatedFloat64TestValuesInfo<false, true> testValInfo;
+      const float32_t                            inf32 = -std::numeric_limits<float32_t>::infinity();
+      const float64_t                            inf64 = -std::numeric_limits<float64_t>::infinity();
+      testValInfo.a                                    = emulated_float64_t<false, true>::create(inf64);
+      testValInfo.b                                    = emulated_float64_t<false, true>::create(inf64);
+      testValInfo.constrTestValues                     = {
+                             .int32   = 0,
+                             .int64   = 0,
+                             .uint32  = 0,
+                             .uint64  = 0,
+                             .float32 = inf32
+         //.float64 = inf64
+      };
+
+      testValInfo.fillExpectedTestValues();
+      return performEmulatedFloat64Tests(testValInfo, submitter);
+   }
+
+   EmulatedFloat64TestOutput emulatedFloat64OneValIsInfOtherIsNegInfTest(EF64Submitter& submitter)
+   {
+      smart_refctd_ptr<ISemaphore> semaphore = m_device->createSemaphore(0);
+
+      EmulatedFloat64TestValuesInfo<false, true> testValInfo;
+      const float64_t                            inf64 = -std::numeric_limits<float64_t>::infinity();
+      testValInfo.a                                    = emulated_float64_t<false, true>::create(inf64);
+      testValInfo.b                                    = emulated_float64_t<false, true>::create(inf64);
+      testValInfo.constrTestValues                     = {
+                             .int32   = 0,
+                             .int64   = 0,
+                             .uint32  = 0,
+                             .uint64  = 0,
+                             .float32 = 0
+         //.float64 = inf64
+      };
+
+      testValInfo.fillExpectedTestValues();
+      auto firstTestOutput = performEmulatedFloat64Tests(testValInfo, submitter);
+      std::swap(testValInfo.a, testValInfo.b);
+      testValInfo.fillExpectedTestValues();
+      auto secondTestOutput = performEmulatedFloat64Tests(testValInfo, submitter);
+
+      return {firstTestOutput.cpuTestsSucceed && secondTestOutput.cpuTestsSucceed, firstTestOutput.gpuTestsSucceed && secondTestOutput.gpuTestsSucceed};
+   }
+
+   // TODO: fix
+   EmulatedFloat64TestOutput emulatedFloat64BNaNTest(EF64Submitter& submitter)
+   {
+      EmulatedFloat64TestOutput    output    = {true, true};
+      smart_refctd_ptr<ISemaphore> semaphore = m_device->createSemaphore(0);
+
+      for (uint32_t i = 0u; i < EmulatedFloat64TestIterations; ++i)
+      {
+         std::random_device rd;
+         std::mt19937       mt(rd());
+
+         std::uniform_int_distribution  i32Distribution(-std::numeric_limits<int>::max(), std::numeric_limits<int>::max());
+         std::uniform_int_distribution  i64Distribution(-std::numeric_limits<int64_t>::max(), std::numeric_limits<int64_t>::max());
+         std::uniform_int_distribution  u32Distribution(-std::numeric_limits<uint32_t>::max(), std::numeric_limits<uint32_t>::max());
+         std::uniform_int_distribution  u64Distribution(-std::numeric_limits<uint64_t>::max(), std::numeric_limits<uint64_t>::max());
+         std::uniform_real_distribution f32Distribution(-100000.0f, 100000.0f);
+         std::uniform_real_distribution f64Distribution(-100000.0, 100000.0);
+
+         EmulatedFloat64TestValuesInfo<false, true> testValInfo;
+         double                                     aTmp = f64Distribution(mt);
+         double                                     bTmp = std::numeric_limits<float64_t>::quiet_NaN();
+         testValInfo.a.data                              = reinterpret_cast<emulated_float64_t<false, true>::storage_t&>(aTmp);
+         testValInfo.b.data                              = reinterpret_cast<emulated_float64_t<false, true>::storage_t&>(bTmp);
+         testValInfo.constrTestValues.int32              = i32Distribution(mt);
+         testValInfo.constrTestValues.int64              = i64Distribution(mt);
+         testValInfo.constrTestValues.uint32             = u32Distribution(mt);
+         testValInfo.constrTestValues.uint64             = u64Distribution(mt);
+         testValInfo.constrTestValues.float32            = f32Distribution(mt);
+         //testValInfo.constrTestValues.float64 = f64Distribution(mt);
+
+         testValInfo.fillExpectedTestValues();
+         auto singleTestOutput = performEmulatedFloat64Tests(testValInfo, submitter);
+
+         if (!singleTestOutput.cpuTestsSucceed)
+            output.cpuTestsSucceed = false;
+         if (!singleTestOutput.gpuTestsSucceed)
+            output.gpuTestsSucceed = false;
+      }
+
+      return output;
+   }
+
+   EmulatedFloat64TestOutput emulatedFloat64OneValIsInfTest(EF64Submitter& submitter)
+   {
+      auto getRandomFloat64 = []()
+      {
+         static std::random_device             rd;
+         static std::mt19937                   mt(rd());
+         static std::uniform_real_distribution distribution(-100000.0, 100000.0);
+
+         return distribution(mt);
+      };
+
+      auto getInfinity = []()
+      {
+         return std::numeric_limits<float64_t>::infinity();
+      };
+
+      EmulatedFloat64TestOutput firstTestOutput  = emulatedFloat64LoopedTests_impl(submitter, EmulatedFloat64TestIterations / 2, getRandomFloat64, getInfinity);
+      EmulatedFloat64TestOutput secondTestOutput = emulatedFloat64LoopedTests_impl(submitter, EmulatedFloat64TestIterations / 2, getInfinity, getRandomFloat64);
+
+      EmulatedFloat64TestOutput output;
+      output.cpuTestsSucceed = firstTestOutput.cpuTestsSucceed && secondTestOutput.cpuTestsSucceed;
+      output.gpuTestsSucceed = firstTestOutput.gpuTestsSucceed && secondTestOutput.gpuTestsSucceed;
+      return output;
+   }
+
+   EmulatedFloat64TestOutput emulatedFloat64OneValIsNegInfTest(EF64Submitter& submitter)
+   {
+      auto getRandomFloat64 = []()
+      {
+         static std::random_device             rd;
+         static std::mt19937                   mt(rd());
+         static std::uniform_real_distribution distribution(-100000.0, 100000.0);
+
+
+         return distribution(mt);
+      };
+
+      auto getNegInfinity = []()
+      {
+         return -std::numeric_limits<float64_t>::infinity();
+      };
+
+      EmulatedFloat64TestOutput firstTestOutput  = emulatedFloat64LoopedTests_impl(submitter, EmulatedFloat64TestIterations / 2, getRandomFloat64, getNegInfinity);
+      EmulatedFloat64TestOutput secondTestOutput = emulatedFloat64LoopedTests_impl(submitter, EmulatedFloat64TestIterations / 2, getNegInfinity, getRandomFloat64);
+
+      EmulatedFloat64TestOutput output;
+      output.cpuTestsSucceed = firstTestOutput.cpuTestsSucceed && secondTestOutput.cpuTestsSucceed;
+      output.gpuTestsSucceed = firstTestOutput.gpuTestsSucceed && secondTestOutput.gpuTestsSucceed;
+      return output;
+   }
+
+   EmulatedFloat64TestOutput emulatedFloat64OneValIsZeroTest(EF64Submitter& submitter)
+   {
+      auto getRandomFloat64 = []()
+      {
+         static std::random_device             rd;
+         static std::mt19937                   mt(rd());
+         static std::uniform_real_distribution distribution(-100000.0, 100000.0);
+
+         return distribution(mt);
+      };
+
+      auto getZero = []()
+      {
+         return 0.0;
+      };
+
+      EmulatedFloat64TestOutput firstTestOutput  = emulatedFloat64LoopedTests_impl(submitter, EmulatedFloat64TestIterations / 2, getRandomFloat64, getZero);
+      EmulatedFloat64TestOutput secondTestOutput = emulatedFloat64LoopedTests_impl(submitter, EmulatedFloat64TestIterations / 2, getZero, getRandomFloat64);
+
+      EmulatedFloat64TestOutput output;
+      output.cpuTestsSucceed = firstTestOutput.cpuTestsSucceed && secondTestOutput.cpuTestsSucceed;
+      output.gpuTestsSucceed = firstTestOutput.gpuTestsSucceed && secondTestOutput.gpuTestsSucceed;
+      return output;
+   }
+
+   EmulatedFloat64TestOutput emulatedFloat64OneValIsNegZeroTest(EF64Submitter& submitter)
+   {
+      auto getRandomFloat64 = []()
+      {
+         static std::random_device             rd;
+         static std::mt19937                   mt(rd());
+         static std::uniform_real_distribution distribution(-100000.0, 100000.0);
+
+         return distribution(mt);
+      };
+
+      auto getNegZero = []()
+      {
+         return -0.0;
+      };
+
+      EmulatedFloat64TestOutput firstTestOutput  = emulatedFloat64LoopedTests_impl(submitter, EmulatedFloat64TestIterations / 2, getRandomFloat64, getNegZero);
+      EmulatedFloat64TestOutput secondTestOutput = emulatedFloat64LoopedTests_impl(submitter, EmulatedFloat64TestIterations / 2, getNegZero, getRandomFloat64);
+
+      EmulatedFloat64TestOutput output;
+      output.cpuTestsSucceed = firstTestOutput.cpuTestsSucceed && secondTestOutput.cpuTestsSucceed;
+      output.gpuTestsSucceed = firstTestOutput.gpuTestsSucceed && secondTestOutput.gpuTestsSucceed;
+      return output;
+   }
+
+   template<bool FastMath, bool FlushDenormToZero>
+   EmulatedFloat64TestOutput performEmulatedFloat64Tests(EmulatedFloat64TestValuesInfo<FastMath, FlushDenormToZero>& testValInfo, EF64Submitter& submitter)
+   {
+      emulated_float64_t<false, true> a = testValInfo.a;
+      emulated_float64_t<false, true> b = testValInfo.b;
+
+      const TestValues<FastMath, FlushDenormToZero> cpuTestValues = {
+         .int32CreateVal    = emulated_float64_t<FastMath, FlushDenormToZero>::create(testValInfo.constrTestValues.int32).data,
+         .int64CreateVal    = emulated_float64_t<FastMath, FlushDenormToZero>::create(testValInfo.constrTestValues.int64).data,
+         .uint32CreateVal   = emulated_float64_t<FastMath, FlushDenormToZero>::create(testValInfo.constrTestValues.uint32).data,
+         .uint64CreateVal   = emulated_float64_t<FastMath, FlushDenormToZero>::create(testValInfo.constrTestValues.uint64).data,
+         .float32CreateVal  = emulated_float64_t<FastMath, FlushDenormToZero>::create(testValInfo.constrTestValues.float32).data,
+         .float64CreateVal  = emulated_float64_t<FastMath, FlushDenormToZero>::create(testValInfo.constrTestValues.float64).data,
+         .additionVal       = (a + b).data,
+         .substractionVal   = (a - b).data,
+         .multiplicationVal = (a * b).data,
+         .divisionVal       = (a / b).data,
+         .lessOrEqualVal    = a <= b,
+         .greaterOrEqualVal = a >= b,
+         .equalVal          = a == b,
+         .notEqualVal       = a != b,
+         .lessVal           = a<b,
+                      .greaterVal = a>
+            b};
+
+      EmulatedFloat64TestOutput output;
+
+      // cpu validation
+      output.cpuTestsSucceed = compareEmulatedFloat64TestValues<false, true, EmulatedFloatTestDevice::CPU>(testValInfo.expectedTestValues, cpuTestValues);
+
+      // gpu validation
+      PushConstants pc;
+      pc.a              = reinterpret_cast<uint64_t&>(a);
+      pc.b              = reinterpret_cast<uint64_t&>(b);
+      pc.constrTestVals = testValInfo.constrTestValues;
+
+      submitter.setPushConstants(pc);
+      auto gpuTestValues = submitter.submitGetGPUTestValues();
+
+      output.gpuTestsSucceed = compareEmulatedFloat64TestValues<false, true, EmulatedFloatTestDevice::GPU>(testValInfo.expectedTestValues, gpuTestValues);
+
+      return output;
+   }
+
+   void runEF64Benchmarks()
+   {
+      constexpr uint32_t WarmupDispatches = 1000;
+      constexpr uint64_t TargetBudgetMs   = 400; // ~400ms per row
+
+      Aggregator agg(m_logger, m_device, m_physicalDevice, getComputeQueue()->getFamilyIndex());
+      agg.applyCli({
+         .argv              = this->argv,
+         .defaultOutputPath = "EF64Bench.json",
+         .appName           = "64_EmulatedFloatTest",
+      });
+
+      const auto shaderKey     = nbl::this_example::builtin::build::get_spirv_key<"benchmark">(m_device.get());
+      auto       shaderVariant = GPUBenchmarkHelper::ShaderVariant::Precompiled(shaderKey);
+
+      // One bench instance per mode -> one report row per mode. std::array
+      // gives stack-allocated, pointer-stable storage; no parallel
+      // benchPtrs vector needed since the aggregator iterates the span
+      // directly.
+      constexpr std::pair<EF64_BENCHMARK_MODE, const char*> kModes[] = {
+         {EF64_BENCHMARK_MODE::NATIVE, "native"},
+         {EF64_BENCHMARK_MODE::EF64_FAST_MATH_ENABLED, "emulated, fast-math"},
+         {EF64_BENCHMARK_MODE::EF64_FAST_MATH_DISABLED, "emulated, strict"},
+         {EF64_BENCHMARK_MODE::SUBGROUP_DIVIDED_WORK, "subgroup-divided"},
+         {EF64_BENCHMARK_MODE::INTERLEAVED, "interleaved"},
+      };
+      constexpr size_t            N = std::size(kModes);
+      std::vector<CEF64Benchmark> benches;
+      benches.reserve(N);
+      for (size_t i = 0; i < N; ++i)
+      {
+         const auto& [mode, leaf] = kModes[i];
+         benches.emplace_back(agg, CEF64Benchmark::SetupData{
+                                      .assetMgr         = m_assetMgr,
+                                      .name             = {"EF64", leaf},
+                                      .mode             = mode,
+                                      .variant          = shaderVariant,
+                                      .warmupDispatches = WarmupDispatches,
+                                      .targetBudgetMs   = TargetBudgetMs,
+                                   });
+      }
+
+      const RunContext ctx = {
+         .shape          = CEF64Benchmark::shape(),
+         .targetBudgetMs = TargetBudgetMs,
+         .sectionLabel   = CEF64Benchmark::kSectionLabel,
+      };
+      agg.runSessionAndReport(Aggregator::makeSpan(benches, ctx));
+   }
+
+
+   template<typename... Args>
+   inline bool logFail(const char* msg, Args&&... args)
+   {
+      m_logger->log(msg, ILogger::ELL_ERROR, std::forward<Args>(args)...);
+      return false;
+   }
+
+   std::ofstream m_logFile;
 };
 
 NBL_MAIN_FUNC(CompatibilityTest)
\ No newline at end of file
diff --git a/common/include/nbl/examples/Benchmark/BenchmarkCli.h b/common/include/nbl/examples/Benchmark/BenchmarkCli.h
new file mode 100644
index 000000000..abb0912da
--- /dev/null
+++ b/common/include/nbl/examples/Benchmark/BenchmarkCli.h
@@ -0,0 +1,125 @@
+// Copyright (C) 2018-2024 - DevSH Graphics Programming Sp. z O.O.
+// This file is part of the "Nabla Engine".
+// For conditions of distribution and use, see copyright notice in nabla.h
+
+#ifndef _NBL_COMMON_BENCHMARK_CLI_INCLUDED_
+#define _NBL_COMMON_BENCHMARK_CLI_INCLUDED_
+
+#include <nabla.h>
+#include "nbl/examples/Benchmark/BenchmarkTypes.h"
+
+#include <algorithm>
+#include <charconv>
+#include <filesystem>
+#include <span>
+#include <string>
+#include <string_view>
+#include <utility>
+#include <vector>
+
+namespace benchmark_cli
+{
+
+struct ParsedArgs
+{
+   std::string                                             outputPath;
+   bool                                                    noBaseline    = false;
+   bool                                                    noColor       = false;
+   bool                                                    helpRequested = false;
+   std::vector<std::pair<std::string, std::string>>        baselines; // (label, path)
+   nbl::core::vector<nbl::core::vector<nbl::core::string>> focus;
+   // Median-of-K window count used for focused rows (see
+   // IBenchmark::samplesForCurrentRow). Default 3 trades 3 * targetBudgetMs
+   // wall time for jitter-robust comparisons.
+   uint32_t focusSamples = 3;
+};
+
+// Pure: parse argv into a ParsedArgs. Unknown flags are silently ignored;
+// the caller decides what to do on help / no-baseline / per-load failure.
+inline ParsedArgs parseArgs(std::span<const std::string> argv, std::string defaultOutputPath)
+{
+   ParsedArgs out;
+   out.outputPath = std::move(defaultOutputPath);
+
+   for (size_t i = 1; i < argv.size(); ++i)
+   {
+      if (argv[i] == "--output" && i + 1 < argv.size())
+         out.outputPath = argv[++i];
+      else if (argv[i] == "--no-baseline")
+         out.noBaseline = true;
+      else if (argv[i] == "--no-color")
+         out.noColor = true;
+      else if (argv[i] == "--baseline" && i + 1 < argv.size())
+      {
+         const std::string& spec = argv[++i];
+         const auto         eq   = spec.find('=');
+         std::string        label, path;
+         if (eq == std::string::npos)
+         {
+            path            = spec;
+            const auto stem = std::filesystem::path(path).stem().string();
+            label           = stem.empty() ? std::string("baseline") : stem;
+         }
+         else
+         {
+            label = spec.substr(0, eq);
+            path  = spec.substr(eq + 1);
+         }
+         out.baselines.emplace_back(std::move(label), std::move(path));
+      }
+      else if (argv[i] == "--focus" && i + 1 < argv.size())
+      {
+         out.focus.push_back(splitFocusSpec(argv[++i]));
+      }
+      else if (argv[i] == "--focus-samples" && i + 1 < argv.size())
+      {
+         // Clamp to [1, 32]: 1 disables the median+outlier path, 32 is well past
+         // the point of diminishing returns (variance of the trimmed mean drops
+         // ~1/sqrt(K)). from_chars instead of stol to stay no-exceptions per
+         // Nabla style; malformed input leaves the default in place.
+         const std::string& s = argv[++i];
+         long v = 0;
+         const auto [_, ec] = std::from_chars(s.data(), s.data() + s.size(), v);
+         if (ec == std::errc{})
+            out.focusSamples = uint32_t(std::clamp<long>(v, 1, 32));
+      }
+      else if (argv[i] == "--help" || argv[i] == "-h")
+      {
+         out.helpRequested = true;
+      }
+   }
+   return out;
+}
+
+inline void printHelp(nbl::system::ILogger* logger, std::string_view appName, std::string_view defaultOutputPath)
+{
+   benchLogFmt(logger, nbl::system::ILogger::ELL_INFO,
+      "{} CLI:\n"
+      "  --output PATH              write this run's report to PATH (default: {})\n"
+      "  --baseline [LABEL=]PATH    load PATH as a baseline; LABEL becomes the column header ('vs LABEL').\n"
+      "                             repeatable. If LABEL= is omitted, the file's stem is used\n"
+      "                             (e.g. main.json -> 'main'). '=' is used instead of ':' so Windows\n"
+      "                             drive letters in paths don't collide with the separator.\n"
+      "  --no-baseline              skip the default auto-load of the output path\n"
+      "  --no-color                 disable ANSI color in the live table (also honored: NO_COLOR=1 env var)\n"
+      "  --focus NAME               print a focused baseline-comparison table for NAME before the run.\n"
+      "                             NAME is the hierarchical name with '>' between segments (whitespace\n"
+      "                             around '>' is optional). Repeatable; one row per --focus. The first\n"
+      "                             loaded baseline is the reference for inline deltas in this table.\n"
+      "                             Example: --focus \"Linear > Linear > 1:1\"\n"
+      "  --focus-samples N          run each focused row N times (median + outlier rejection) for\n"
+      "                             jitter-robust comparisons. Default 3; clamped to [1, 32]. N=1\n"
+      "                             matches the rest-phase single-shot path. Wall time per focused\n"
+      "                             row scales linearly with N.\n"
+      "  --help, -h                 print this help\n"
+      "\n"
+      "Default behaviour: with no flags, the prior run's output (if present) is loaded as the single\n"
+      "  'baseline', and a fresh one is written at the end; iterate-and-compare with no flags needed.\n"
+      "\n"
+      "Failed loads (missing/corrupt file) log a warning and continue; the corresponding column reads 'n/a'.",
+      appName, defaultOutputPath);
+}
+
+}
+
+#endif
diff --git a/common/include/nbl/examples/Benchmark/BenchmarkConsole.h b/common/include/nbl/examples/Benchmark/BenchmarkConsole.h
new file mode 100644
index 000000000..e857c36d4
--- /dev/null
+++ b/common/include/nbl/examples/Benchmark/BenchmarkConsole.h
@@ -0,0 +1,526 @@
+// Copyright (C) 2018-2024 - DevSH Graphics Programming Sp. z O.O.
+// This file is part of the "Nabla Engine".
+// For conditions of distribution and use, see copyright notice in nabla.h
+
+#ifndef _NBL_COMMON_BENCHMARK_CONSOLE_INCLUDED_
+#define _NBL_COMMON_BENCHMARK_CONSOLE_INCLUDED_
+
+#include <nabla.h>
+#include "nbl/examples/Benchmark/BenchmarkTypes.h"
+
+#include <algorithm>
+#include <cmath>
+#include <cstdlib>
+#include <format>
+#include <optional>
+#include <span>
+#include <string>
+#include <string_view>
+#include <unordered_map>
+#include <vector>
+
+// Methods templated on the baselines range must expose `.label` and `.rowsByName`.
+class BenchmarkConsole
+{
+   public:
+   BenchmarkConsole()
+   {
+      // https://no-color.org
+      if (const char* nc = std::getenv("NO_COLOR"); nc && nc[0] != '\0')
+         m_useAnsi = false;
+   }
+   explicit BenchmarkConsole(nbl::core::smart_refctd_ptr<nbl::system::ILogger> logger)
+      : BenchmarkConsole()
+   {
+      m_logger = std::move(logger);
+   }
+
+   void                  setLogger(nbl::core::smart_refctd_ptr<nbl::system::ILogger> logger) { m_logger = std::move(logger); }
+   nbl::system::ILogger* getLogger() const { return m_logger.get(); }
+
+   void setSilent(bool s) { m_silent = s; }
+   bool silent() const { return m_silent; }
+
+   void setColorEnabled(bool e) { m_useAnsi = e; }
+   bool colorEnabled() const { return m_useAnsi; }
+
+   // `neutral` is ELL_PERFORMANCE blue (not a full reset) so uncolored cell
+   // parts inherit the logger's line-wrap color. Only correct because rows /
+   // banners are all logged at ELL_PERFORMANCE.
+   struct Ansi
+   {
+      static constexpr std::string_view neutral = "\033[34m";
+      static constexpr std::string_view reset   = "\033[0m";
+      static constexpr std::string_view red     = "\033[31m";
+      static constexpr std::string_view green   = "\033[32m";
+      static constexpr std::string_view yellow  = "\033[33m";
+      static constexpr std::string_view cyan    = "\033[36m";
+      static constexpr std::string_view bold    = "\033[1m";
+   };
+
+   // visualWidth excludes ANSI escape bytes (std::format's `{:>{}}` counts
+   // bytes), so colored cells must be padded manually via padCell.
+   struct CellOut
+   {
+      std::string text;
+      size_t      visualWidth = 0;
+   };
+
+   const Format::Widths& widths() const { return m_widths; }
+   void                  growWidthFor(std::string_view joined) { m_widths.grow(joined); }
+
+   // Sizes int columns to unchanged-value width, float columns to "value
+   // (+/-delta)" with delta=0. Changed-int rows overflow; padding every row
+   // for worst-case wastes ~40% horizontal space on stable runs.
+   void growForBaseline(const BaselineRow& b)
+   {
+      const auto growInt = [&](size_t& w, uint64_t v)
+      {
+         if (v == BaselineRow::kAbsent)
+            return;
+         w = std::max(w, std::format("{}", v).size());
+      };
+      growInt(m_widths.regs,   b.registerCount);
+      growInt(m_widths.code,   b.codeSizeBytes);
+      growInt(m_widths.shared, b.sharedMemBytes);
+      growInt(m_widths.local,  b.privateMemBytes);
+
+      if (b.psPerSample > 0.0)
+      {
+         m_widths.psSample = std::max(m_widths.psSample, floatCellPlainText(b.psPerSample, 0.0).size());
+         const double gsBase = 1000.0 / b.psPerSample;
+         m_widths.gsamples = std::max(m_widths.gsamples, floatCellPlainText(gsBase, 0.0).size());
+      }
+   }
+
+   // Pre-register so the header (logged once up front) doesn't stay narrower than later rows.
+   void registerVariant(std::span<const std::string> name) { m_widths.grow(joinName(name)); }
+   void registerVariant(std::initializer_list<std::string_view> name)
+   {
+      std::vector<std::string> tmp;
+      tmp.reserve(name.size());
+      for (auto s : name)
+         tmp.emplace_back(s);
+      m_widths.grow(joinName(tmp));
+   }
+
+   void logSectionBanner(std::string_view banner) const
+   {
+      if (banner.empty())
+         return;
+      if (m_useAnsi)
+         benchLogFmt(m_logger.get(), nbl::system::ILogger::ELL_PERFORMANCE, "{}{}{}{}", Ansi::bold, Ansi::cyan, banner, Ansi::reset);
+      else
+         benchLogFmt(m_logger.get(), nbl::system::ILogger::ELL_PERFORMANCE, "{}", banner);
+   }
+
+   // Once per session, not per span, otherwise readers see the same text N times.
+   template<typename Baselines>
+   void logBannerNotes(const Baselines& baselines) const
+   {
+      if (std::empty(baselines))
+         return;
+      const auto&       primary      = *std::begin(baselines);
+      const bool        multi        = std::distance(std::begin(baselines), std::end(baselines)) > 1;
+      const std::string primaryLabel = primary.label;
+      benchLogFmt(m_logger.get(), nbl::system::ILogger::ELL_PERFORMANCE,
+         "Note: ps/sample lower = faster; GSamples/s higher = faster. Inline annotations compare to primary baseline '{}': "
+         "floats show 'value (+/-delta)' always; ints show 'old -> new' only when changed.",
+         primaryLabel);
+      if (multi)
+         benchLogFmt(m_logger.get(), nbl::system::ILogger::ELL_PERFORMANCE,
+            "Note: trailing 'vs LABEL' columns carry raw ps/sample deltas against secondary baselines (primary skipped, shown inline).");
+      benchLogFmt(m_logger.get(), nbl::system::ILogger::ELL_PERFORMANCE,
+         "Note: '[WG!]' on a delta = baseline's workload shape (workgroup / dispatch / samplesPerDispatch) differs from this run, comparison is apples-to-oranges.");
+      benchLogFmt(m_logger.get(), nbl::system::ILogger::ELL_PERFORMANCE,
+         "Note: float deltas only get green/red coloring when the relative change is >= {:.0f}% (typical GPU jitter is 1-2%); smaller deltas stay neutral.",
+         kFloatColorThreshold * 100.0);
+   }
+
+   template<typename Baselines>
+   void logHeader(const Baselines& baselines) const
+   {
+      std::string line = std::format("{:<{}} | {:>{}} | {:>{}} | {:>{}} | {:>{}} | {:>{}} | {:>{}}",
+         "Name",       m_widths.name,
+         "ps/sample",  m_widths.psSample,
+         "GSamples/s", m_widths.gsamples,
+         "regs",       m_widths.regs,
+         "code(B)",    m_widths.code,
+         "shared(B)",  m_widths.shared,
+         "local(B)",   m_widths.local);
+      // Primary is shown inline on every value column; only secondaries get trailing columns.
+      size_t idx = 0;
+      for (const auto& b : baselines)
+      {
+         if (idx++ == 0)
+            continue;
+         const std::string col = std::format("vs {}", b.label);
+         line += std::format(" | {:>{}}", col, baselineColWidth(b.label));
+      }
+      benchLogFmt(m_logger.get(), nbl::system::ILogger::ELL_PERFORMANCE, "{}", line);
+   }
+
+   template<typename Baselines>
+   void logRow(std::span<const std::string> name, std::string_view joinedName,
+      const TimingResult& t, const PipelineStats& s,
+      const std::unordered_map<std::string, BaselineRef>& rowBaselines,
+      const Baselines&                                    baselines) const
+   {
+      if (!m_logger || m_silent)
+         return;
+
+      const BaselineRow* primary = nullptr;
+      if (!std::empty(baselines))
+      {
+         const std::string key = makeKey(name);
+         const auto&       b0  = *std::begin(baselines);
+         if (auto it = b0.rowsByName.find(key); it != b0.rowsByName.end())
+            primary = &it->second;
+      }
+
+      // ps_per_sample * GSamples/s == 1000 (see runTimed), so GSamples is derived not stored.
+      const auto baselineGSamples = primary ? std::optional<double>{primary->psPerSample > 0.0 ? 1000.0 / primary->psPerSample : 0.0} : std::nullopt;
+
+      std::string line = std::format("{:<{}}", joinedName, m_widths.name);
+      line += " | " + padCell(formatFloatCell(t.ps_per_sample,   primary ? std::optional<double>{primary->psPerSample} : std::nullopt, true),  m_widths.psSample);
+      line += " | " + padCell(formatFloatCell(t.gsamples_per_s,  baselineGSamples,                                                    false), m_widths.gsamples);
+      line += " | " + padCell(formatIntCell(s.registerCount,     primary ? primary->registerCount   : BaselineRow::kAbsent),                                     m_widths.regs);
+      line += " | " + padCell(formatIntCell(s.codeSizeBytes,     primary ? primary->codeSizeBytes   : BaselineRow::kAbsent),                                     m_widths.code);
+      line += " | " + padCell(formatIntCell(s.sharedMemBytes,    primary ? primary->sharedMemBytes  : BaselineRow::kAbsent),                                     m_widths.shared);
+      line += " | " + padCell(formatIntCell(s.privateMemBytes,   primary ? primary->privateMemBytes : BaselineRow::kAbsent),                                     m_widths.local);
+
+      size_t idx = 0;
+      for (const auto& b : baselines)
+      {
+         if (idx++ == 0)
+            continue;
+         std::string plain;
+         bool        better      = false;
+         bool        significant = false;
+         bool        haveValue   = false;
+         bool        flagShape   = false;
+         if (auto it = rowBaselines.find(b.label); it != rowBaselines.end() && it->second.psPerSample > 0.0)
+         {
+            const double delta = t.ps_per_sample - it->second.psPerSample;
+            plain       = std::format("{:+.3f}", delta);
+            better      = delta < 0.0;
+            significant = std::abs(delta) / it->second.psPerSample >= kFloatColorThreshold;
+            haveValue   = true;
+            flagShape   = it->second.shapeMismatch;
+         }
+         else
+         {
+            plain = "n/a";
+         }
+         std::string suffix = flagShape ? std::string(" [WG!]") : std::string();
+         CellOut cell;
+         cell.visualWidth = plain.size() + suffix.size();
+         if (!m_useAnsi)
+         {
+            cell.text = plain + suffix;
+         }
+         else
+         {
+            const bool        paint        = haveValue && significant;
+            const std::string_view col     = paint ? (better ? Ansi::green : Ansi::red) : std::string_view{};
+            std::string       coloredPlain = paint
+                                                ? std::format("{}{}{}", col, plain, Ansi::neutral)
+                                                : plain;
+            std::string       coloredSuffix = flagShape
+                                                ? std::format("{}{}{}{}", Ansi::bold, Ansi::red, suffix, Ansi::neutral)
+                                                : std::string();
+            cell.text = coloredPlain + coloredSuffix;
+         }
+         line += " | " + padCell(cell, baselineColWidth(b.label));
+      }
+      benchLogFmt(m_logger.get(), nbl::system::ILogger::ELL_PERFORMANCE, "{}", line);
+   }
+
+   // Flat table, one row per (variant, stat); each baseline gets one delta column:
+   //
+   //   Name  | stat        | current | vs iter47 | vs iter48
+   //   X     | ps/sample   |   2.151 |   -0.044  |   +0.123
+   //   X     | GSamples/s  |   464.9 |   +9.456  |   -7.234
+   //   X     | regs        |      40 |     +0    |     +0
+   //   X     | code(B)     |    4992 |   +128    |      0
+   template<typename Baselines, typename Results>
+   void printBaselineComparison(std::span<const nbl::core::vector<nbl::core::string>> names,
+      const Baselines& baselines, const Results& results) const
+   {
+      if (!m_logger || names.empty())
+         return;
+      if (std::empty(baselines))
+      {
+         benchLogFmt(m_logger.get(), nbl::system::ILogger::ELL_WARNING,
+            "--focus requested {} variant(s) but no baselines are loaded, nothing to compare against. "
+            "Did your --baseline paths fail to load?",
+            names.size());
+         return;
+      }
+
+      struct Current
+      {
+         TimingResult  t;
+         PipelineStats s;
+         Workload      w;
+         bool          present = false;
+      };
+      std::unordered_map<std::string, Current> currentByKey;
+      currentByKey.reserve(std::size(results));
+      for (const auto& r : results)
+         currentByKey[makeKey(r.name)] = {r.timing, r.stats, r.workload, true};
+
+      const size_t baselineCount = static_cast<size_t>(std::distance(std::begin(baselines), std::end(baselines)));
+
+      std::vector<std::vector<CellOut>> rows;
+      rows.reserve(1 + names.size() * 6);
+
+      {
+         auto plainCell = [](std::string s) -> CellOut { const size_t w = s.size(); return {std::move(s), w}; };
+         std::vector<CellOut> header;
+         header.reserve(3 + baselineCount);
+         header.push_back(plainCell("Name"));
+         header.push_back(plainCell("stat"));
+         header.push_back(plainCell("current"));
+         for (const auto& b : baselines)
+            header.push_back(plainCell(std::format("vs {}", b.label)));
+         rows.push_back(std::move(header));
+      }
+
+      auto floatStatRow = [&](const char* label, std::string_view joined, bool have, double curV,
+                               const Workload& curW, const std::string& key,
+                               auto baselineLookup /*BaselineRow -> double*/, bool lowerIsBetter)
+      {
+         auto plainCell = [](std::string s) -> CellOut { const size_t w = s.size(); return {std::move(s), w}; };
+         std::vector<CellOut> row;
+         row.reserve(3 + baselineCount);
+         row.push_back(plainCell(std::string(joined)));
+         row.push_back(plainCell(label));
+         row.push_back(have ? plainCell(formatFloat5(curV)) : plainCell("n/a"));
+
+         for (const auto& b : baselines)
+         {
+            auto bit = b.rowsByName.find(key);
+            if (!have || bit == b.rowsByName.end())
+            {
+               row.push_back(plainCell("n/a"));
+               continue;
+            }
+            const double baseV = baselineLookup(bit->second);
+            if (baseV <= 0.0)
+            {
+               row.push_back(plainCell("n/a"));
+               continue;
+            }
+            const bool        shapeMismatch = curW.present() && bit->second.workload.present() && (curW.shape != bit->second.workload.shape);
+            const double      delta         = curV - baseV;
+            const std::string deltaStr      = std::format("{}{}", delta >= 0 ? "+" : "-", formatFloat5(std::abs(delta)));
+            const bool        significant   = std::abs(delta) / baseV >= kFloatColorThreshold;
+            const std::string suffix        = shapeMismatch ? std::string(" [WG!]") : std::string();
+            CellOut           cell;
+            cell.visualWidth = deltaStr.size() + suffix.size();
+            if (!m_useAnsi || !significant)
+            {
+               cell.text = m_useAnsi && shapeMismatch
+                              ? std::format("{}{}{}{}{}", deltaStr, Ansi::bold, Ansi::red, suffix, Ansi::neutral)
+                              : deltaStr + suffix;
+            }
+            else
+            {
+               const bool             better = (lowerIsBetter && delta < 0.0) || (!lowerIsBetter && delta > 0.0);
+               const std::string_view col    = better ? Ansi::green : Ansi::red;
+               std::string            coloredDelta  = std::format("{}{}{}", col, deltaStr, Ansi::neutral);
+               std::string            coloredSuffix = shapeMismatch
+                                                         ? std::format("{}{}{}{}", Ansi::bold, Ansi::red, suffix, Ansi::neutral)
+                                                         : std::string();
+               cell.text = coloredDelta + coloredSuffix;
+            }
+            row.push_back(std::move(cell));
+         }
+         rows.push_back(std::move(row));
+      };
+
+      auto intStatRow = [&](const char* label, std::string_view joined, bool have, uint64_t curV,
+                              const Workload& curW, const std::string& key, uint64_t BaselineRow::* baseField)
+      {
+         auto plainCell = [](std::string s) -> CellOut { const size_t w = s.size(); return {std::move(s), w}; };
+         std::vector<CellOut> row;
+         row.reserve(3 + baselineCount);
+         row.push_back(plainCell(std::string(joined)));
+         row.push_back(plainCell(label));
+         row.push_back(have ? plainCell(std::format("{}", curV)) : plainCell("n/a"));
+
+         for (const auto& b : baselines)
+         {
+            auto bit = b.rowsByName.find(key);
+            if (!have || bit == b.rowsByName.end())
+            {
+               row.push_back(plainCell("n/a"));
+               continue;
+            }
+            const uint64_t baseV = bit->second.*baseField;
+            if (baseV == BaselineRow::kAbsent)
+            {
+               row.push_back(plainCell("n/a"));
+               continue;
+            }
+            const bool        shapeMismatch = curW.present() && bit->second.workload.present() && (curW.shape != bit->second.workload.shape);
+            const int64_t     delta         = int64_t(curV) - int64_t(baseV);
+            const std::string deltaStr      = std::format("{:+d}", delta);
+            const std::string suffix        = shapeMismatch ? std::string(" [WG!]") : std::string();
+            CellOut           cell;
+            cell.visualWidth = deltaStr.size() + suffix.size();
+            if (!m_useAnsi)
+            {
+               cell.text = deltaStr + suffix;
+            }
+            else
+            {
+               std::string coloredDelta  = delta != 0
+                                              ? std::format("{}{}{}", Ansi::yellow, deltaStr, Ansi::neutral)
+                                              : deltaStr;
+               std::string coloredSuffix = shapeMismatch
+                                              ? std::format("{}{}{}{}", Ansi::bold, Ansi::red, suffix, Ansi::neutral)
+                                              : std::string();
+               cell.text = coloredDelta + coloredSuffix;
+            }
+            row.push_back(std::move(cell));
+         }
+         rows.push_back(std::move(row));
+      };
+
+      for (const auto& nameVec : names)
+      {
+         const std::string joined = joinName(nameVec);
+         const std::string key    = makeKey(nameVec);
+         const auto        cit    = currentByKey.find(key);
+         const bool        have   = (cit != currentByKey.end()) && cit->second.present;
+         const auto&       t      = have ? cit->second.t : TimingResult {};
+         const auto&       s      = have ? cit->second.s : PipelineStats {};
+         const auto&       w      = have ? cit->second.w : Workload {};
+
+         floatStatRow("ps/sample",  joined, have, t.ps_per_sample,  w, key,
+            [](const BaselineRow& b) { return b.psPerSample; }, true);
+         floatStatRow("GSamples/s", joined, have, t.gsamples_per_s, w, key,
+            [](const BaselineRow& b) { return b.psPerSample > 0.0 ? 1000.0 / b.psPerSample : 0.0; }, false);
+         intStatRow("regs",      joined, have, s.registerCount,   w, key, &BaselineRow::registerCount);
+         intStatRow("code(B)",   joined, have, s.codeSizeBytes,   w, key, &BaselineRow::codeSizeBytes);
+         intStatRow("shared(B)", joined, have, s.sharedMemBytes,  w, key, &BaselineRow::sharedMemBytes);
+         intStatRow("local(B)",  joined, have, s.privateMemBytes, w, key, &BaselineRow::privateMemBytes);
+      }
+
+      const size_t        nCols = 3 + baselineCount;
+      std::vector<size_t> colWidths(nCols, 0);
+      for (const auto& r : rows)
+         for (size_t i = 0; i < r.size() && i < nCols; ++i)
+            colWidths[i] = std::max(colWidths[i], r[i].visualWidth);
+
+      benchLogFmt(m_logger.get(), nbl::system::ILogger::ELL_PERFORMANCE,
+         "=== Focus comparison ({} variant(s) vs {} baseline(s); ps/sample lower is better, integer deltas are absolute) ===",
+         names.size(), baselineCount);
+      auto leftPad = [](const CellOut& c, size_t targetWidth) -> std::string
+      {
+         if (c.visualWidth >= targetWidth)
+            return c.text;
+         return c.text + std::string(targetWidth - c.visualWidth, ' ');
+      };
+      for (size_t ri = 0; ri < rows.size(); ++ri)
+      {
+         std::string line;
+         for (size_t ci = 0; ci < rows[ri].size(); ++ci)
+         {
+            if (ci)
+               line.append(" | ");
+            if (ci <= 1)
+               line += leftPad(rows[ri][ci], colWidths[ci]);
+            else
+               line += padCell(rows[ri][ci], colWidths[ci]);
+         }
+         benchLogFmt(m_logger.get(), nbl::system::ILogger::ELL_PERFORMANCE, "{}", line);
+      }
+   }
+
+   private:
+   static constexpr size_t kBaselineMinColWidth = 10;
+   size_t                  baselineColWidth(std::string_view label) const
+   {
+      return std::max<size_t>(kBaselineMinColWidth, std::string_view("vs ").size() + label.size());
+   }
+
+   // Typical GPU jitter is 1-2%; coloring below 5% would mostly highlight noise.
+   static constexpr double kFloatColorThreshold = 0.05;
+
+   // std::format counts ANSI escape bytes, so `{:>N}` can't pad colored cells.
+   std::string padCell(const CellOut& c, size_t targetWidth) const
+   {
+      if (c.visualWidth >= targetWidth)
+         return c.text;
+      return std::string(targetWidth - c.visualWidth, ' ') + c.text;
+   }
+
+   // "regs 40 -> 54" is more useful than "+14 from somewhere", show both endpoints.
+   CellOut formatIntCell(uint64_t current, uint64_t baseline) const
+   {
+      if (baseline == BaselineRow::kAbsent || baseline == current)
+      {
+         auto s = std::format("{}", current);
+         const size_t w = s.size();
+         return {std::move(s), w};
+      }
+      const std::string baseStr = std::format("{}", baseline);
+      const std::string curStr  = std::format("{}", current);
+      const std::string plain   = std::format("{} -> {}", baseStr, curStr);
+      const size_t      visW    = plain.size();
+      if (!m_useAnsi)
+         return {plain, visW};
+      auto colored = std::format("{}{} -> {}{}", Ansi::yellow, baseStr, curStr, Ansi::neutral);
+      return {std::move(colored), visW};
+   }
+
+   // ~5 chars including the decimal point, so column widths stay predictable
+   // across ps/sample (0.5..100) and GSamples/s (0.03..1000+).
+   static std::string formatFloat5(double v)
+   {
+      const double mag = std::abs(v);
+      if (mag >= 10000.0) return std::format("{:.0f}", v);
+      if (mag >= 1000.0)  return std::format("{:.1f}", v);
+      if (mag >= 100.0)   return std::format("{:.1f}", v);
+      if (mag >= 10.0)    return std::format("{:.2f}", v);
+      return std::format("{:.3f}", v);
+   }
+
+   static std::string floatCellPlainText(double value, double delta)
+   {
+      const std::string deltaStr = std::format("{}{}", delta >= 0 ? "+" : "-", formatFloat5(std::abs(delta)));
+      return std::format("{} ({})", formatFloat5(value), deltaStr);
+   }
+
+   CellOut formatFloatCell(double current, std::optional<double> baseline, bool lowerIsBetter) const
+   {
+      if (!baseline.has_value() || *baseline <= 0.0)
+      {
+         auto s = formatFloat5(current);
+         const size_t w = s.size();
+         return {std::move(s), w};
+      }
+      const double      delta    = current - *baseline;
+      const std::string plain    = floatCellPlainText(current, delta);
+      const size_t      visW     = plain.size();
+      const bool        significant = std::abs(delta) / *baseline >= kFloatColorThreshold;
+      if (!m_useAnsi || !significant)
+         return {plain, visW};
+      const std::string      valStr   = formatFloat5(current);
+      const std::string      deltaStr = std::format("{}{}", delta >= 0 ? "+" : "-", formatFloat5(std::abs(delta)));
+      const bool             better   = (lowerIsBetter && delta < 0.0) || (!lowerIsBetter && delta > 0.0);
+      const std::string_view color    = better ? Ansi::green : Ansi::red;
+      auto                   colored = std::format("{} ({}{}{})", valStr, color, deltaStr, Ansi::neutral);
+      return {std::move(colored), visW};
+   }
+
+   nbl::core::smart_refctd_ptr<nbl::system::ILogger> m_logger;
+   Format::Widths                                    m_widths;
+   bool                                              m_silent  = false;
+   bool                                              m_useAnsi = true;
+};
+
+#endif
diff --git a/common/include/nbl/examples/Benchmark/BenchmarkJson.h b/common/include/nbl/examples/Benchmark/BenchmarkJson.h
new file mode 100644
index 000000000..dc7846848
--- /dev/null
+++ b/common/include/nbl/examples/Benchmark/BenchmarkJson.h
@@ -0,0 +1,285 @@
+// Copyright (C) 2018-2024 - DevSH Graphics Programming Sp. z O.O.
+// This file is part of the "Nabla Engine".
+// For conditions of distribution and use, see copyright notice in nabla.h
+
+#ifndef _NBL_COMMON_BENCHMARK_JSON_INCLUDED_
+#define _NBL_COMMON_BENCHMARK_JSON_INCLUDED_
+
+#include <nabla.h>
+#include "nbl/examples/Benchmark/BenchmarkTypes.h"
+#include "nlohmann/json.hpp"
+
+#include <algorithm>
+#include <fstream>
+#include <optional>
+#include <string>
+#include <unordered_set>
+#include <vector>
+
+namespace benchmark_json
+{
+
+// Builds the "device" JSON object from a physical device, or null if dev is null.
+inline nlohmann::json buildDeviceMetadata(const nbl::video::IPhysicalDevice* dev)
+{
+   if (!dev)
+      return nullptr;
+   const auto&    p     = dev->getProperties();
+   nlohmann::json out   = nlohmann::json::object();
+   out["name"]          = std::string(p.deviceName);
+   out["vendorID"]      = p.vendorID;
+   out["deviceID"]      = p.deviceID;
+   out["driverID"]      = static_cast<int>(p.driverID);
+   out["driverName"]    = std::string(p.driverName);
+   out["driverInfo"]    = std::string(p.driverInfo);
+   out["driverVersion"] = p.driverVersion;
+   out["deviceUUID"]    = std::vector<uint8_t>(p.deviceUUID, p.deviceUUID + 16);
+   out["driverUUID"]    = std::vector<uint8_t>(p.driverUUID, p.driverUUID + 16);
+   return out;
+}
+
+// Parses a JSON report file into a Baseline. Returns nullopt on missing /
+// unparseable / empty file. Caller is responsible for appending / replacing
+// in their baseline store and for feeding rows into BenchmarkConsole widths.
+inline std::optional<Baseline> loadBaselineFile(std::string label, const std::string& path)
+{
+   std::ifstream f(path);
+   if (!f.is_open())
+      return std::nullopt;
+
+   nlohmann::json j;
+   try
+   {
+      f >> j;
+   }
+   catch (const std::exception&)
+   {
+      return std::nullopt;
+   }
+
+   const auto resultsIt = j.find("results");
+   if (resultsIt == j.end() || !resultsIt->is_array())
+      return std::nullopt;
+
+   std::unordered_map<std::string, BaselineRow> rowsByName;
+   for (const auto& r : *resultsIt)
+   {
+      const auto n  = r.find("name");
+      const auto ps = r.find("ps_per_sample");
+      if (n == r.end() || ps == r.end())
+         continue;
+      if (!n->is_array() || !ps->is_number())
+         continue;
+      std::vector<std::string> nameVec;
+      nameVec.reserve(n->size());
+      for (const auto& seg : *n)
+      {
+         if (!seg.is_string())
+         {
+            nameVec.clear();
+            break;
+         }
+         nameVec.emplace_back(seg.get<std::string>());
+      }
+      if (nameVec.empty())
+         continue;
+         
+      BaselineRow row;
+      row.psPerSample     = ps->get<double>();
+      row.registerCount   = r.at("regs").get<uint64_t>();
+      row.codeSizeBytes   = r.at("code_bytes").get<uint64_t>();
+      row.sharedMemBytes  = r.at("shared_mem_bytes").get<uint64_t>();
+      row.privateMemBytes = r.at("local_mem_bytes").get<uint64_t>();
+      row.stackBytes      = r.at("stack_bytes").get<uint64_t>();
+      row.subgroupSize    = r.at("subgroup_size").get<uint64_t>();
+
+      auto readUvec3 = [&](const char* key, nbl::hlsl::uint32_t3& out)
+      {
+         const auto& a = r.at(key);
+         out.x         = a[0].get<uint32_t>();
+         out.y         = a[1].get<uint32_t>();
+         out.z         = a[2].get<uint32_t>();
+      };
+      readUvec3("workgroup_size", row.workload.shape.workgroupSize);
+      readUvec3("dispatch_groups", row.workload.shape.dispatchGroupCount);
+      row.workload.shape.samplesPerDispatch = r.at("samples_per_dispatch").get<uint64_t>();
+      row.workload.benchDispatches          = r.at("bench_dispatches").get<uint32_t>();
+
+      rowsByName[makeKey(nameVec)] = row;
+   }
+   if (rowsByName.empty())
+      return std::nullopt;
+
+   return Baseline {std::move(label), path, j.at("device"), std::move(rowsByName)};
+}
+
+// Writes a JSON report. Preserves rows in the prior file whose names weren't
+// re-measured this run, so writeReportFile can be an intermediate checkpoint
+// during a multi-bench-class session. Returns preservedCount via out-param.
+inline bool writeReportFile(const std::string& path, const nlohmann::json& deviceMetadata, const std::vector<Baseline>& baselines, const std::vector<Result>& results, nbl::system::ILogger* logger, size_t* outPreservedCount = nullptr)
+{
+   nlohmann::json doc;
+   doc["version"] = 1;
+
+   if (!deviceMetadata.is_null())
+      doc["device"] = deviceMetadata;
+
+   if (!baselines.empty())
+   {
+      auto& baselinesNode = doc["baselines"] = nlohmann::json::object();
+      for (const auto& b : baselines)
+         baselinesNode[b.label] = b.path;
+   }
+   auto& resultsNode = doc["results"] = nlohmann::json::array();
+
+   std::unordered_set<std::string> currentKeys;
+   currentKeys.reserve(results.size());
+   for (const auto& r : results)
+      currentKeys.insert(makeKey(r.name));
+
+   for (const auto& r : results)
+   {
+      nlohmann::json row;
+      row["name"]             = r.name;
+      row["ps_per_sample"]    = r.timing.ps_per_sample;
+      row["gsamples_per_s"]   = r.timing.gsamples_per_s;
+      row["ms_total"]         = r.timing.ms_total;
+      row["regs"]             = r.stats.registerCount;
+      row["code_bytes"]       = r.stats.codeSizeBytes;
+      row["shared_mem_bytes"] = r.stats.sharedMemBytes;
+      row["local_mem_bytes"]  = r.stats.privateMemBytes;
+      row["stack_bytes"]      = r.stats.stackBytes;
+      row["subgroup_size"]    = r.stats.subgroupSize;
+
+      // Structured so JSON preserves the exact numeric type.
+      if (!r.stats.unknowns.empty())
+      {
+         using F   = nbl::video::IGPUPipelineBase::SExecutableStatistic::FORMAT;
+         auto& arr = row["unknown_stats"] = nlohmann::json::array();
+         for (const auto& s : r.stats.unknowns)
+         {
+            nlohmann::json entry;
+            entry["name"] = s.name;
+            switch (s.format)
+            {
+               case F::BOOL32:
+                  entry["type"]  = "bool";
+                  entry["value"] = s.value.b32;
+                  break;
+               case F::INT64:
+                  entry["type"]  = "int";
+                  entry["value"] = s.value.i64;
+                  break;
+               case F::UINT64:
+                  entry["type"]  = "uint";
+                  entry["value"] = s.value.u64;
+                  break;
+               case F::FLOAT64:
+                  entry["type"]  = "float";
+                  entry["value"] = s.value.f64;
+                  break;
+            }
+            arr.push_back(std::move(entry));
+         }
+      }
+
+      row["workgroup_size"]       = {r.workload.shape.workgroupSize.x, r.workload.shape.workgroupSize.y, r.workload.shape.workgroupSize.z};
+      row["dispatch_groups"]      = {r.workload.shape.dispatchGroupCount.x, r.workload.shape.dispatchGroupCount.y, r.workload.shape.dispatchGroupCount.z};
+      row["samples_per_dispatch"] = r.workload.shape.samplesPerDispatch;
+      row["bench_dispatches"]     = r.workload.benchDispatches;
+
+      resultsNode.push_back(std::move(row));
+   }
+
+   // Caveat: renamed/removed variants linger forever. Delete the output JSON
+   // to get a clean slate.
+   size_t preservedCount = 0;
+   {
+      std::ifstream in(path);
+      if (in.is_open())
+      {
+         nlohmann::json existing;
+         try
+         {
+            in >> existing;
+         }
+         catch (const std::exception&)
+         {
+            existing = nullptr;
+         }
+         const auto rIt = existing.find("results");
+         if (rIt != existing.end() && rIt->is_array())
+         {
+            for (const auto& priorRow : *rIt)
+            {
+               const auto n = priorRow.find("name");
+               if (n == priorRow.end() || !n->is_array())
+                  continue;
+               std::vector<std::string> nameVec;
+               bool                     ok = true;
+               for (const auto& seg : *n)
+               {
+                  if (!seg.is_string())
+                  {
+                     ok = false;
+                     break;
+                  }
+                  nameVec.emplace_back(seg.get<std::string>());
+               }
+               if (!ok || nameVec.empty())
+                  continue;
+               if (currentKeys.find(makeKey(nameVec)) != currentKeys.end())
+                  continue; // re-measured this run
+
+               resultsNode.push_back(priorRow);
+               ++preservedCount;
+            }
+         }
+      }
+   }
+
+   std::ofstream f(path, std::ios::out | std::ios::trunc);
+   if (!f.is_open())
+   {
+      benchLogFmt(logger, nbl::system::ILogger::ELL_ERROR, "benchmark_json::writeReportFile: failed to open '{}'", path);
+      return false;
+   }
+
+   // One result per line keeps `git diff` showing one row per change instead
+   // of N lines per row.
+   f << "{\n";
+   f << "  \"version\": " << doc["version"].dump() << ",\n";
+   if (doc.contains("device"))
+   {
+      // Compact value render so byte arrays (deviceUUID etc.) stay inline.
+      const auto& dev = doc["device"];
+      f << "  \"device\": {\n";
+      bool first = true;
+      for (auto it = dev.begin(); it != dev.end(); ++it)
+      {
+         if (!first)
+            f << ",\n";
+         first = false;
+         f << "    \"" << it.key() << "\": " << it.value().dump();
+      }
+      f << "\n  },\n";
+   }
+   if (doc.contains("baselines"))
+      f << "  \"baselines\": " << doc["baselines"].dump() << ",\n";
+   f << "  \"results\": [";
+   for (size_t i = 0; i < resultsNode.size(); ++i)
+   {
+      f << (i ? ",\n    " : "\n    ");
+      f << resultsNode[i].dump();
+   }
+   f << (resultsNode.empty() ? "]\n" : "\n  ]\n");
+   f << "}\n";
+
+   if (outPreservedCount)
+      *outPreservedCount = preservedCount;
+   return true;
+}
+
+} // namespace benchmark_json
+
+#endif
diff --git a/common/include/nbl/examples/Benchmark/BenchmarkTypes.h b/common/include/nbl/examples/Benchmark/BenchmarkTypes.h
new file mode 100644
index 000000000..274c19514
--- /dev/null
+++ b/common/include/nbl/examples/Benchmark/BenchmarkTypes.h
@@ -0,0 +1,211 @@
+// Copyright (C) 2018-2024 - DevSH Graphics Programming Sp. z O.O.
+// This file is part of the "Nabla Engine".
+// For conditions of distribution and use, see copyright notice in nabla.h
+
+#ifndef _NBL_COMMON_BENCHMARK_TYPES_INCLUDED_
+#define _NBL_COMMON_BENCHMARK_TYPES_INCLUDED_
+
+#include <nabla.h>
+#include "nlohmann/json.hpp"
+
+#include <algorithm>
+#include <format>
+#include <limits>
+#include <span>
+#include <string>
+#include <string_view>
+#include <unordered_map>
+#include <vector>
+
+struct PipelineStats
+{
+   uint64_t    registerCount   = 0;
+   uint64_t    codeSizeBytes   = 0;
+   uint64_t    sharedMemBytes  = 0;
+   uint64_t    privateMemBytes = 0;
+   uint64_t    stackBytes      = 0;
+   uint32_t    subgroupSize    = 0;
+   std::string raw;
+
+   // Driver stats matchStat didn't recognise. Structured (not lossy-stringified
+   // into `raw`) so JSON round-trips the correct numeric type.
+   std::vector<nbl::video::IGPUPipelineBase::SExecutableStatistic> unknowns;
+};
+
+struct TimingResult
+{
+   float64_t elapsed_ns     = 0.0;
+   uint64_t  totalSamples   = 0;
+   float64_t ps_per_sample  = 0.0;
+   float64_t gsamples_per_s = 0.0;
+   float64_t ms_total       = 0.0;
+};
+
+struct Format
+{
+   struct Widths
+   {
+      size_t name     = std::string_view("Name").size();
+      size_t psSample = std::string_view("ps/sample").size();
+      size_t gsamples = std::string_view("GSamples/s").size();
+      size_t regs     = std::string_view("regs").size();
+      size_t code     = std::string_view("code(B)").size();
+      size_t shared   = std::string_view("shared(B)").size();
+      size_t local    = std::string_view("local(B)").size();
+
+      void grow(std::string_view joinedName) { name = std::max(name, joinedName.size()); }
+   };
+
+   static std::string headerBase(const Widths& w = {})
+   {
+      return std::format("{:<{}} | {:>12} | {:>12} | {:>6} | {:>8} | {:>12} | {:>12}",
+         "Name", w.name, "ps/sample", "GSamples/s", "regs", "code(B)", "shared(B)", "local(B)");
+   }
+
+   static std::string dataBase(const Widths& w, std::string_view joinedName, const TimingResult& t, const PipelineStats& s)
+   {
+      return std::format("{:<{}} | {:>12.3f} | {:>12.3f} | {:>6} | {:>8} | {:>12} | {:>12}",
+         joinedName, w.name, t.ps_per_sample, t.gsamples_per_s, s.registerCount, s.codeSizeBytes, s.sharedMemBytes, s.privateMemBytes);
+   }
+};
+
+// The "what was measured" part of a workload. Workload (adds benchDispatches)
+// and RunContext (adds banner label + budget) both embed a WorkloadShape, so
+// the shape can be sliced into either from the other.
+struct WorkloadShape
+{
+   nbl::hlsl::uint32_t3 workgroupSize      = {0, 0, 0};
+   nbl::hlsl::uint32_t3 dispatchGroupCount = {0, 0, 0};
+   uint64_t             samplesPerDispatch = 0;
+
+   inline bool operator==(const WorkloadShape& other) const
+   {
+      return workgroupSize == other.workgroupSize && dispatchGroupCount == other.dispatchGroupCount && samplesPerDispatch == other.samplesPerDispatch;
+   }
+
+   inline bool operator!=(const WorkloadShape& other) const
+   {
+      return !(*this == other);
+   }
+};
+
+struct Workload
+{
+   WorkloadShape shape;
+   uint32_t      benchDispatches = 0;
+
+   // Default-constructed (all zeros) signals "not recorded".
+   bool present() const { return shape.samplesPerDispatch != 0; }
+};
+
+struct BaselineRow
+{
+   // UINT64_MAX sentinel: no real pipeline stat reaches that magnitude, so an
+   // "absent" field can't collide with a real value. The current run can also
+   // produce kAbsent when a driver doesn't expose a given stat.
+   static constexpr uint64_t kAbsent = std::numeric_limits<uint64_t>::max();
+
+   float64_t psPerSample     = 0.0;
+   uint64_t  registerCount   = kAbsent;
+   uint64_t  codeSizeBytes   = kAbsent;
+   uint64_t  sharedMemBytes  = kAbsent;
+   uint64_t  privateMemBytes = kAbsent;
+   uint64_t  stackBytes      = kAbsent;
+   uint64_t  subgroupSize    = kAbsent; // uint64_t (not 32) to share kAbsent semantics
+   Workload  workload {};
+};
+
+// Per-baseline reference for a single row: the baseline's ps/sample plus
+// whether its recorded workload shape differs from this run (renders the
+// "[WG!]" marker so the reader knows the comparison is questionable).
+struct BaselineRef
+{
+   float64_t psPerSample   = 0.0;
+   bool      shapeMismatch = false;
+};
+
+struct Result
+{
+   // Hierarchical name, outermost first. Tooling can group by any prefix; the
+   // console joins with " > ".
+   nbl::core::vector<nbl::core::string>         name;
+   TimingResult                                 timing {};
+   PipelineStats                                stats {};
+   Workload                                     workload {};
+   std::unordered_map<std::string, BaselineRef> baselines;
+};
+
+inline std::string joinName(std::span<const std::string> name, std::string_view sep = " > ")
+{
+   std::string out;
+   for (size_t i = 0; i < name.size(); ++i)
+   {
+      if (i)
+         out.append(sep);
+      out.append(name[i]);
+   }
+   return out;
+}
+
+// Unit-separator (\x1f) between segments so makeKey can't collide with any
+// user-supplied content.
+inline std::string makeKey(std::span<const std::string> name)
+{
+   std::string k;
+   size_t      total = 0;
+   for (const auto& s : name)
+      total += s.size() + 1;
+   k.reserve(total);
+   for (size_t i = 0; i < name.size(); ++i)
+   {
+      if (i)
+         k.push_back('\x1f');
+      k.append(name[i]);
+   }
+   return k;
+}
+
+inline nbl::core::vector<nbl::core::string> splitFocusSpec(std::string_view spec)
+{
+   auto trim = [](std::string_view s)
+   {
+      while (!s.empty() && (s.front() == ' ' || s.front() == '\t'))
+         s.remove_prefix(1);
+      while (!s.empty() && (s.back() == ' ' || s.back() == '\t'))
+         s.remove_suffix(1);
+      return s;
+   };
+   nbl::core::vector<nbl::core::string> out;
+   size_t                               start = 0;
+   while (start <= spec.size())
+   {
+      size_t end = spec.find('>', start);
+      if (end == std::string_view::npos)
+         end = spec.size();
+      const auto seg = trim(spec.substr(start, end - start));
+      if (!seg.empty())
+         out.emplace_back(seg);
+      if (end == spec.size())
+         break;
+      start = end + 1;
+   }
+   return out;
+}
+
+struct Baseline
+{
+   std::string                                  label;
+   std::string                                  path;
+   nlohmann::json                               device; // top-level "device" field from the file, or null if absent
+   std::unordered_map<std::string, BaselineRow> rowsByName; // makeKey(name) -> stats
+};
+
+template<typename... Args>
+inline void benchLogFmt(nbl::system::ILogger* logger, nbl::system::ILogger::E_LOG_LEVEL level, std::string_view fmt, const Args&... args)
+{
+   if (!logger)
+      return;
+   logger->log("%s", level, std::vformat(fmt, std::make_format_args(args...)).c_str());
+}
+
+#endif
diff --git a/common/include/nbl/examples/Benchmark/GPUBenchmarkHelper.h b/common/include/nbl/examples/Benchmark/GPUBenchmarkHelper.h
new file mode 100644
index 000000000..b5bd69a13
--- /dev/null
+++ b/common/include/nbl/examples/Benchmark/GPUBenchmarkHelper.h
@@ -0,0 +1,693 @@
+// Copyright (C) 2018-2024 - DevSH Graphics Programming Sp. z O.O.
+// This file is part of the "Nabla Engine".
+// For conditions of distribution and use, see copyright notice in nabla.h
+
+#ifndef _NBL_COMMON_GPU_BENCHMARK_HELPER_INCLUDED_
+#define _NBL_COMMON_GPU_BENCHMARK_HELPER_INCLUDED_
+
+#include <nabla.h>
+#include "nbl/examples/examples.hpp"
+#include "nbl/examples/Benchmark/BenchmarkTypes.h"
+#include "nbl/asset/utils/CCompilerSet.h"
+#include "nbl/asset/utils/IShaderCompiler.h"
+
+#include <algorithm>
+#include <functional>
+#include <ranges>
+#include <span>
+#include <string>
+#include <string_view>
+#include <vector>
+
+class GPUBenchmarkHelper
+{
+public:
+   struct InitData
+   {
+      nbl::core::smart_refctd_ptr<nbl::video::ILogicalDevice> device;
+      nbl::core::smart_refctd_ptr<nbl::system::ILogger>       logger;
+      nbl::video::IPhysicalDevice*                            physicalDevice     = nullptr;
+      uint32_t                                                computeFamilyIndex = 0;
+      nbl::hlsl::uint32_t3                                    dispatchGroupCount = {0, 0, 0};
+      uint64_t                                                samplesPerDispatch = 0;
+   };
+
+   // One shader source for a benchmark variant. Picks ONE of two paths:
+   //   * Precompiled: `precompiledKey` is a SPIRV asset key from CMake-time
+   //     NBL_CREATE_NSC_COMPILE_RULES. `defines` is ignored.
+   //   * Runtime: `sourcePath` is an .hlsl file resolved against "app_resources",
+   //     compiled at load time with `defines` as -D macros. Use this for fast
+   //     variant iteration without reconfiguring CMake.
+   struct ShaderVariant
+   {
+      // SMacroDefinition uses string_view; this struct owns the backing strings.
+      struct Define
+      {
+         std::string identifier;
+         std::string definition;
+      };
+
+      std::string                         sourcePath;
+      std::string                         precompiledKey;
+      std::vector<Define>                 defines;
+      nbl::asset::IShader::E_SHADER_STAGE stage = nbl::asset::IShader::E_SHADER_STAGE::ESS_COMPUTE;
+
+      static ShaderVariant Precompiled(std::string key)
+      {
+         ShaderVariant v;
+         v.precompiledKey = std::move(key);
+         return v;
+      }
+      static ShaderVariant FromSource(std::string path, std::vector<Define> defs = {}, nbl::asset::IShader::E_SHADER_STAGE stage = nbl::asset::IShader::E_SHADER_STAGE::ESS_COMPUTE)
+      {
+         ShaderVariant v;
+         v.sourcePath = std::move(path);
+         v.defines    = std::move(defs);
+         v.stage      = stage;
+         return v;
+      }
+
+      bool isRuntime() const { return !sourcePath.empty() && precompiledKey.empty(); }
+      bool isPrecompiled() const { return !precompiledKey.empty(); }
+   };
+
+   // Layout: [bindOnce] [warmup x dispatchOne][ts0][bench x dispatchOne][ts1][cooldown x dispatchOne]
+   // Cooldown == warmup so the measured window isn't on a winding-down tail.
+   // Putting binds inside dispatchOne adds per-iteration cmdbuf overhead that
+   // shows up in ps/sample on tight shaders.
+   using DispatchFn = std::function<void(nbl::video::IGPUCommandBuffer*)>;
+
+   // Input choice for createBindings(). Output is always implicit BDA.
+   enum class InputBuffer : uint8_t
+   {
+      None,
+      BDA,
+      SSBO,
+      UBO,
+   };
+
+   struct BindingsConfig
+   {
+      size_t      outputBytes       = 0;
+      size_t      pushConstantBytes = 0;
+      size_t      inputBytes        = 0;
+      InputBuffer inputMode         = InputBuffer::None;
+   };
+
+   struct Bindings
+   {
+      nbl::core::smart_refctd_ptr<nbl::video::IGPUBuffer>         outputBuf;
+      uint64_t                                                    outputAddress = 0;
+      nbl::core::smart_refctd_ptr<nbl::video::IGPUPipelineLayout> pipelineLayout;
+
+      nbl::core::smart_refctd_ptr<nbl::video::IGPUBuffer> inputBuf;
+      uint64_t                                            inputAddress = 0; // BDA mode only
+
+      nbl::core::smart_refctd_ptr<nbl::video::IGPUDescriptorSetLayout> dsLayout;
+      nbl::core::smart_refctd_ptr<nbl::video::IGPUDescriptorSet>       ds;
+   };
+
+   struct PipelineEntry
+   {
+      nbl::core::smart_refctd_ptr<nbl::video::IGPUComputePipeline> pipeline;
+      nbl::core::smart_refctd_ptr<nbl::video::IGPUPipelineLayout>  layout;
+      PipelineStats                                                stats;
+      std::string                                                  tag;
+   };
+
+   // Common bindOnce body: bind pipeline + upload push constants. Most benches
+   // have nothing else in bindOnce; the few that bind descriptor sets too call
+   // cb->bindDescriptorSets() before/after this.
+   template<typename PC>
+   static void defaultBindAndPush(nbl::video::IGPUCommandBuffer* cb, const PipelineEntry& pe, const PC& pc)
+   {
+      cb->bindComputePipeline(pe.pipeline.get());
+      cb->pushConstants(pe.layout.get(), nbl::asset::IShader::E_SHADER_STAGE::ESS_COMPUTE, 0, sizeof(PC), &pc);
+   }
+
+   // Dispatch using m_dispatchGroupCount (the setup-time shape).
+   void defaultDispatch(nbl::video::IGPUCommandBuffer* cb) const
+   {
+      cb->dispatch(m_dispatchGroupCount.x, m_dispatchGroupCount.y, m_dispatchGroupCount.z);
+   }
+
+   bool init(const InitData& data)
+   {
+      m_device             = data.device;
+      m_logger             = data.logger;
+      m_physicalDevice     = data.physicalDevice;
+      m_queue              = m_device->getQueue(data.computeFamilyIndex, 0);
+      m_dispatchGroupCount = data.dispatchGroupCount;
+      m_samplesPerDispatch = data.samplesPerDispatch;
+
+      m_cmdpool = m_device->createCommandPool(data.computeFamilyIndex,
+         nbl::video::IGPUCommandPool::CREATE_FLAGS::RESET_COMMAND_BUFFER_BIT);
+      if (!m_cmdpool->createCommandBuffers(nbl::video::IGPUCommandPool::BUFFER_LEVEL::PRIMARY, 1u, &m_cmdbuf))
+      {
+         benchLogFmt(m_logger.get(), nbl::system::ILogger::ELL_ERROR, "GPUBenchmarkHelper: failed to create cmdbuf");
+         return false;
+      }
+
+      nbl::video::IQueryPool::SCreationParams qparams = {};
+      qparams.queryType                               = nbl::video::IQueryPool::TYPE::TIMESTAMP;
+      qparams.queryCount                              = 2;
+      qparams.pipelineStatisticsFlags                 = nbl::video::IQueryPool::PIPELINE_STATISTICS_FLAGS::NONE;
+      m_queryPool                                     = m_device->createQueryPool(qparams);
+      if (!m_queryPool)
+      {
+         benchLogFmt(m_logger.get(), nbl::system::ILogger::ELL_ERROR, "GPUBenchmarkHelper: failed to create timestamp query pool");
+         return false;
+      }
+      return true;
+   }
+
+   // Load (precompiled path) or load+compile (runtime path) a variant's SPIRV.
+   nbl::core::smart_refctd_ptr<nbl::asset::IShader> loadShader(const ShaderVariant& variant, nbl::core::smart_refctd_ptr<nbl::asset::IAssetManager> assetMgr) const
+   {
+      using namespace nbl;
+      if (!variant.isRuntime() && !variant.isPrecompiled())
+      {
+         benchLogFmt(m_logger.get(), system::ILogger::ELL_ERROR, "GPUBenchmarkHelper::loadShader: variant has neither sourcePath nor precompiledKey");
+         return nullptr;
+      }
+
+      asset::IAssetLoader::SAssetLoadParams lp = {};
+      lp.logger                                = m_logger.get();
+
+      std::string key;
+      if (variant.isPrecompiled())
+      {
+         lp.workingDirectory = "app_resources";
+         key                 = variant.precompiledKey;
+      }
+      else
+      {
+         lp.workingDirectory = "";
+         key                 = "app_resources/" + variant.sourcePath;
+      }
+      auto       bundle = assetMgr->getAsset(key, lp);
+      const auto assets = bundle.getContents();
+      if (assets.empty())
+      {
+         benchLogFmt(m_logger.get(), system::ILogger::ELL_ERROR, "GPUBenchmarkHelper::loadShader: failed to load '{}'", key);
+         return nullptr;
+      }
+      auto source = asset::IAsset::castDown<asset::IShader>(assets[0]);
+      if (!source)
+      {
+         benchLogFmt(m_logger.get(), system::ILogger::ELL_ERROR, "GPUBenchmarkHelper::loadShader: '{}' is not an IShader asset", key);
+         return nullptr;
+      }
+
+      if (variant.isPrecompiled())
+         return source;
+
+      auto* compilerSet = assetMgr->getCompilerSet();
+      auto  compiler    = compilerSet->getShaderCompiler(source->getContentType());
+      if (!compiler)
+      {
+         benchLogFmt(m_logger.get(), system::ILogger::ELL_ERROR, "GPUBenchmarkHelper::loadShader: no compiler for content type of '{}'", variant.sourcePath);
+         return nullptr;
+      }
+
+      std::vector<asset::IShaderCompiler::SMacroDefinition> wireDefines;
+      wireDefines.reserve(variant.defines.size());
+      for (const auto& d : variant.defines)
+         wireDefines.push_back({d.identifier, d.definition});
+
+      asset::IShaderCompiler::SCompilerOptions options = {};
+      options.stage                                    = variant.stage;
+      options.preprocessorOptions.targetSpirvVersion   = m_device->getPhysicalDevice()->getLimits().spirvVersion;
+      options.preprocessorOptions.sourceIdentifier     = source->getFilepathHint();
+      options.preprocessorOptions.logger               = m_logger.get();
+      options.preprocessorOptions.includeFinder        = compiler->getDefaultIncludeFinder();
+      options.preprocessorOptions.extraDefines         = {wireDefines.data(), wireDefines.size()};
+
+      auto spirv = compilerSet->compileToSPIRV(source.get(), options);
+      if (!spirv)
+         benchLogFmt(m_logger.get(), system::ILogger::ELL_ERROR, "GPUBenchmarkHelper::loadShader: runtime compile failed for '{}'", variant.sourcePath);
+      return spirv;
+   }
+
+   nbl::core::smart_refctd_ptr<nbl::video::IGPUBuffer> allocateDeviceLocalBuffer(nbl::video::IGPUBuffer::SCreationParams bp, const char* label,
+      nbl::video::IDeviceMemoryAllocation::E_MEMORY_ALLOCATE_FLAGS allocFlags = nbl::video::IDeviceMemoryAllocation::EMAF_NONE)
+   {
+      auto buf  = m_device->createBuffer(std::move(bp));
+      auto reqs = buf->getMemoryReqs();
+      reqs.memoryTypeBits &= m_physicalDevice->getDeviceLocalMemoryTypeBits();
+      auto alloc = m_device->allocate(reqs, buf.get(), allocFlags);
+      if (!alloc.isValid())
+         benchLogFmt(m_logger.get(), nbl::system::ILogger::ELL_ERROR, "GPUBenchmarkHelper: failed to allocate {}", label);
+      return buf;
+   }
+
+   struct SingleBindingDS
+   {
+      nbl::core::smart_refctd_ptr<nbl::video::IGPUDescriptorSetLayout> layout;
+      nbl::core::smart_refctd_ptr<nbl::video::IGPUDescriptorSet>       set;
+   };
+
+   SingleBindingDS createSingleBindingDS(
+      nbl::core::smart_refctd_ptr<nbl::video::IGPUBuffer> buffer,
+      nbl::asset::IDescriptor::E_TYPE                     type    = nbl::asset::IDescriptor::E_TYPE::ET_STORAGE_BUFFER,
+      uint32_t                                            binding = 0,
+      nbl::hlsl::ShaderStage                              stages  = nbl::hlsl::ShaderStage::ESS_COMPUTE)
+   {
+      using namespace nbl;
+      const size_t bufferBytes = buffer->getSize();
+
+      video::IGPUDescriptorSetLayout::SBinding b = {
+         .binding     = binding,
+         .type        = type,
+         .createFlags = video::IGPUDescriptorSetLayout::SBinding::E_CREATE_FLAGS::ECF_NONE,
+         .stageFlags  = stages,
+         .count       = 1,
+      };
+      SingleBindingDS out;
+      out.layout = m_device->createDescriptorSetLayout({&b, 1});
+      auto pool  = m_device->createDescriptorPoolForDSLayouts(video::IDescriptorPool::ECF_NONE, {&out.layout.get(), 1});
+      out.set    = pool->createDescriptorSet(core::smart_refctd_ptr(out.layout));
+
+      video::IGPUDescriptorSet::SDescriptorInfo info  = {};
+      info.desc                                       = std::move(buffer);
+      info.info.buffer                                = {.offset = 0, .size = bufferBytes};
+      video::IGPUDescriptorSet::SWriteDescriptorSet w = {
+         .dstSet       = out.set.get(),
+         .binding      = binding,
+         .arrayElement = 0,
+         .count        = 1,
+         .info         = &info,
+      };
+      m_device->updateDescriptorSets({&w, 1}, {});
+      return out;
+   }
+
+   nbl::core::smart_refctd_ptr<nbl::video::IGPUBuffer> createOutputBuffer(
+      size_t                                                       bytes,
+      nbl::core::bitflag<nbl::video::IGPUBuffer::E_USAGE_FLAGS>    extraUsage = nbl::video::IGPUBuffer::E_USAGE_FLAGS::EUF_NONE,
+      nbl::video::IDeviceMemoryAllocation::E_MEMORY_ALLOCATE_FLAGS allocFlags = nbl::video::IDeviceMemoryAllocation::EMAF_NONE)
+   {
+      nbl::video::IGPUBuffer::SCreationParams bp = {};
+      bp.size                                    = bytes;
+      bp.usage                                   = nbl::core::bitflag(nbl::video::IGPUBuffer::EUF_STORAGE_BUFFER_BIT) | extraUsage;
+      return allocateDeviceLocalBuffer(std::move(bp), "output buffer", allocFlags);
+   }
+
+   // Buffer must have been created with EUF_TRANSFER_DST_BIT.
+   void submitFillZero(nbl::core::smart_refctd_ptr<nbl::video::IGPUBuffer> buf, size_t bytes) const
+   {
+      nbl::core::smart_refctd_ptr<nbl::video::IGPUCommandBuffer> initCmdbuf;
+      m_cmdpool->createCommandBuffers(nbl::video::IGPUCommandPool::BUFFER_LEVEL::PRIMARY, 1u, &initCmdbuf);
+      initCmdbuf->begin(nbl::video::IGPUCommandBuffer::USAGE::ONE_TIME_SUBMIT_BIT);
+      const nbl::asset::SBufferRange<nbl::video::IGPUBuffer> range = {.offset = 0, .size = bytes, .buffer = std::move(buf)};
+      initCmdbuf->fillBuffer(range, 0u);
+      initCmdbuf->end();
+
+      const nbl::video::IQueue::SSubmitInfo::SCommandBufferInfo cmds[] = {{.cmdbuf = initCmdbuf.get()}};
+      nbl::video::IQueue::SSubmitInfo                           submit = {};
+      submit.commandBuffers                                            = cmds;
+      m_queue->submit({&submit, 1u});
+      m_device->waitIdle();
+   }
+
+   nbl::core::smart_refctd_ptr<nbl::video::IGPUBuffer> createInputBufferZeroFilled(size_t bytes)
+   {
+      auto buf = createOutputBuffer(bytes, nbl::video::IGPUBuffer::EUF_TRANSFER_DST_BIT);
+      if (buf)
+         submitFillZero(buf, bytes);
+      return buf;
+   }
+
+   // BDA buffer staged into device-local VRAM via IUtilities.
+   nbl::core::smart_refctd_ptr<nbl::video::IGPUBuffer> createBdaBuffer(const void* srcData, size_t bytes)
+   {
+      using namespace nbl;
+      if (!m_utils)
+         m_utils = video::IUtilities::create(core::smart_refctd_ptr(m_device), core::smart_refctd_ptr(m_logger));
+
+      video::IGPUBuffer::SCreationParams bp = {};
+      bp.size                               = bytes;
+      bp.usage                              = core::bitflag(video::IGPUBuffer::EUF_STORAGE_BUFFER_BIT) | video::IGPUBuffer::EUF_SHADER_DEVICE_ADDRESS_BIT | video::IGPUBuffer::EUF_TRANSFER_DST_BIT;
+      core::smart_refctd_ptr<video::IGPUBuffer> buf;
+      auto                                      future = m_utils->createFilledDeviceLocalBufferOnDedMem(
+         video::SIntendedSubmitInfo {.queue = m_queue}, std::move(bp), srcData);
+      future.move_into(buf);
+      return buf;
+   }
+
+   uint32_t createPipeline(const ShaderVariant&                        variant,
+      nbl::core::smart_refctd_ptr<nbl::asset::IAssetManager>           assetMgr,
+      size_t                                                           pushConstantSize,
+      std::string                                                      tag      = "",
+      nbl::core::smart_refctd_ptr<nbl::video::IGPUDescriptorSetLayout> dsLayout = nullptr)
+   {
+      using namespace nbl;
+      const uint32_t idx = uint32_t(m_pipelines.size());
+      m_pipelines.push_back({.tag = tag});
+      PipelineEntry& slot = m_pipelines.back();
+
+      const asset::SPushConstantRange pcRange = {
+         .stageFlags = asset::IShader::E_SHADER_STAGE::ESS_COMPUTE,
+         .offset     = 0,
+         .size       = uint32_t(pushConstantSize),
+      };
+      auto layout = dsLayout
+         ? m_device->createPipelineLayout({&pcRange, 1}, core::smart_refctd_ptr(dsLayout))
+         : m_device->createPipelineLayout({&pcRange, 1});
+      if (!layout)
+      {
+         benchLogFmt(m_logger.get(), system::ILogger::ELL_ERROR, "createPipeline({}): pipeline layout creation failed", tag);
+         return idx;
+      }
+
+      auto source = loadShader(variant, std::move(assetMgr));
+      auto shader = source ? m_device->compileShader({.source = source.get()}) : nullptr;
+      if (!shader)
+      {
+         benchLogFmt(m_logger.get(), system::ILogger::ELL_ERROR, "createPipeline({}): shader load/compile failed", tag);
+         return idx;
+      }
+
+      video::IGPUComputePipeline::SCreationParams pp = {};
+      pp.layout                                      = layout.get();
+      pp.shader.shader                               = shader.get();
+      pp.shader.entryPoint                           = "main";
+      if (m_device->getEnabledFeatures().pipelineExecutableInfo)
+         pp.flags |= video::IGPUComputePipeline::SCreationParams::FLAGS::CAPTURE_STATISTICS | video::IGPUComputePipeline::SCreationParams::FLAGS::CAPTURE_INTERNAL_REPRESENTATIONS;
+
+      core::smart_refctd_ptr<video::IGPUComputePipeline> pipeline;
+      if (!m_device->createComputePipelines(nullptr, {&pp, 1}, &pipeline) || !pipeline)
+      {
+         benchLogFmt(m_logger.get(), system::ILogger::ELL_ERROR, "createPipeline({}): createComputePipelines failed", tag);
+         return idx;
+      }
+
+      if (m_device->getEnabledFeatures().pipelineExecutableInfo)
+      {
+         auto infos     = pipeline->getExecutableInfo();
+         slot.stats.raw = nbl::system::to_string(infos);
+
+         uint64_t vgpr = 0, sgpr = 0;
+         for (const auto& info : infos)
+         {
+            if (info.subgroupSize)
+               slot.stats.subgroupSize = std::max<uint32_t>(slot.stats.subgroupSize, info.subgroupSize);
+            for (const auto& stat : info.structuredStatistics)
+               matchStat(stat, slot.stats, vgpr, sgpr);
+         }
+         // AMD-style drivers expose VGPR/SGPR separately without a combined
+         // register count, so fall back to the sum.
+         if (slot.stats.registerCount == 0 && (vgpr || sgpr))
+            slot.stats.registerCount = vgpr + sgpr;
+
+         if (!slot.stats.raw.empty())
+            benchLogFmt(m_logger.get(), system::ILogger::ELL_PERFORMANCE, "{} pipeline executable report:\n{}", tag, slot.stats.raw);
+      }
+
+      slot.layout   = std::move(layout);
+      slot.pipeline = std::move(pipeline);
+      return idx;
+   }
+
+   Bindings createBindings(const BindingsConfig& cfg)
+   {
+      using namespace nbl;
+      Bindings out;
+
+      out.outputBuf     = createOutputBuffer(cfg.outputBytes, video::IGPUBuffer::EUF_SHADER_DEVICE_ADDRESS_BIT, video::IDeviceMemoryAllocation::EMAF_DEVICE_ADDRESS_BIT);
+      out.outputAddress = out.outputBuf->getDeviceAddress();
+
+      if (cfg.inputMode != InputBuffer::None && cfg.inputBytes > 0)
+      {
+         const bool useBDA  = cfg.inputMode == InputBuffer::BDA;
+         const bool useUBO  = cfg.inputMode == InputBuffer::UBO;
+         const bool useSSBO = cfg.inputMode == InputBuffer::SSBO;
+
+         video::IGPUBuffer::SCreationParams bp = {};
+         bp.size                               = cfg.inputBytes;
+         bp.usage                              = core::bitflag(video::IGPUBuffer::EUF_TRANSFER_DST_BIT);
+         if (useBDA || useSSBO)
+            bp.usage |= video::IGPUBuffer::EUF_STORAGE_BUFFER_BIT;
+         if (useBDA)
+            bp.usage |= video::IGPUBuffer::EUF_SHADER_DEVICE_ADDRESS_BIT;
+         if (useUBO)
+            bp.usage |= video::IGPUBuffer::EUF_UNIFORM_BUFFER_BIT;
+
+         out.inputBuf = allocateDeviceLocalBuffer(std::move(bp), "input buffer",
+            useBDA ? video::IDeviceMemoryAllocation::EMAF_DEVICE_ADDRESS_BIT : video::IDeviceMemoryAllocation::EMAF_NONE);
+
+         if (useBDA)
+            out.inputAddress = out.inputBuf->getDeviceAddress();
+
+         submitFillZero(out.inputBuf, cfg.inputBytes);
+
+         if (useSSBO || useUBO)
+         {
+            video::IGPUDescriptorSetLayout::SBinding b = {
+               .binding     = 0,
+               .type        = useSSBO ? asset::IDescriptor::E_TYPE::ET_STORAGE_BUFFER : asset::IDescriptor::E_TYPE::ET_UNIFORM_BUFFER,
+               .createFlags = video::IGPUDescriptorSetLayout::SBinding::E_CREATE_FLAGS::ECF_NONE,
+               .stageFlags  = nbl::hlsl::ShaderStage::ESS_COMPUTE,
+               .count       = 1,
+            };
+            out.dsLayout = m_device->createDescriptorSetLayout({&b, 1});
+
+            auto pool = m_device->createDescriptorPoolForDSLayouts(video::IDescriptorPool::ECF_NONE, {&out.dsLayout.get(), 1});
+            out.ds    = pool->createDescriptorSet(core::smart_refctd_ptr(out.dsLayout));
+
+            video::IGPUDescriptorSet::SDescriptorInfo info  = {};
+            info.desc                                       = core::smart_refctd_ptr(out.inputBuf);
+            info.info.buffer                                = {.offset = 0, .size = cfg.inputBytes};
+            video::IGPUDescriptorSet::SWriteDescriptorSet w = {
+               .dstSet       = out.ds.get(),
+               .binding      = 0,
+               .arrayElement = 0,
+               .count        = 1,
+               .info         = &info,
+            };
+            m_device->updateDescriptorSets({&w, 1}, {});
+         }
+      }
+
+      {
+         const asset::SPushConstantRange pc = {
+            .stageFlags = nbl::hlsl::ShaderStage::ESS_COMPUTE,
+            .offset     = 0,
+            .size       = uint32_t(cfg.pushConstantBytes),
+         };
+         std::span<const asset::SPushConstantRange> pcRange = cfg.pushConstantBytes > 0 ? std::span<const asset::SPushConstantRange>(&pc, 1) : std::span<const asset::SPushConstantRange> {};
+
+         if (out.dsLayout)
+            out.pipelineLayout = m_device->createPipelineLayout(pcRange, core::smart_refctd_ptr(out.dsLayout));
+         else
+            out.pipelineLayout = m_device->createPipelineLayout(pcRange);
+      }
+
+      return out;
+   }
+
+   struct BdaBuffer
+   {
+      nbl::core::smart_refctd_ptr<nbl::video::IGPUBuffer> buf;
+      uint64_t                                            address = 0;
+   };
+
+   BdaBuffer createBdaOutputBuffer(size_t bytes)
+   {
+      BdaBuffer out;
+      out.buf     = createOutputBuffer(bytes, nbl::video::IGPUBuffer::EUF_SHADER_DEVICE_ADDRESS_BIT, nbl::video::IDeviceMemoryAllocation::EMAF_DEVICE_ADDRESS_BIT);
+      out.address = out.buf ? out.buf->getDeviceAddress() : 0;
+      return out;
+   }
+
+   // Auto-sizes the dispatch count so the measured window covers ~targetBudgetMs
+   // of GPU work. Pilots with a small N, then either scales to the budget or
+   // doubles when the pilot is too noisy (sub-millisecond) to extrapolate.
+   //
+   // `samples` controls jitter robustness: values >1 take K independent
+   // budget-sized timing windows and return the MEDIAN window, costing ~K *
+   // targetBudgetMs of wall time. Median (not min) is used because GPU
+   // measurement noise can be two-sided in practice. 
+   TimingResult runTimedBudgeted(uint32_t warmupDispatches, uint64_t targetBudgetMs, const DispatchFn& bindOnce, const DispatchFn& dispatchOne, uint32_t samples)
+   {
+      const uint64_t     targetBudgetNs = targetBudgetMs * 1'000'000ull;
+      constexpr uint32_t kPilotN        = 64;
+      constexpr uint32_t kMaxN          = 1u << 24; // safety cap for ultra-fast shaders
+      TimingResult       r              = runTimed(warmupDispatches, kPilotN, bindOnce, dispatchOne);
+      uint32_t           lastN          = kPilotN;
+      while (r.elapsed_ns < targetBudgetNs && lastN < kMaxN)
+      {
+         uint32_t nextN;
+         if (r.elapsed_ns > 1'000'000ull) // > 1 ms, stable enough to scale
+         {
+            const double scale = double(targetBudgetNs) / double(r.elapsed_ns);
+            nextN              = uint32_t(std::min<double>(double(kMaxN), std::ceil(double(lastN) * scale)));
+         }
+         else
+         {
+            nextN = std::min(kMaxN, lastN * 2);
+         }
+         if (nextN <= lastN)
+            break; // converged
+         r     = runTimed(warmupDispatches, nextN, bindOnce, dispatchOne);
+         lastN = nextN;
+      }
+
+      if (samples <= 1)
+         return r;
+
+      // Reuse the convergence's final measurement as one of the K samples
+      // (it's already a budget-sized window at lastN). Run K-1 more at the
+      // same N. All windows measure the same dispatch count, so the per-window
+      // elapsed_ns values are directly comparable.
+      std::vector<double> ns;
+      ns.reserve(samples);
+      ns.push_back(r.elapsed_ns);
+      for (uint32_t i = 1; i < samples; ++i)
+      {
+         const TimingResult ri = runTimed(warmupDispatches, lastN, bindOnce, dispatchOne);
+         ns.push_back(ri.elapsed_ns);
+      }
+      std::sort(ns.begin(), ns.end());
+
+      // Outlier rejection: GPU jitter is usually a one-sided spike
+      const double median  = ns[ns.size() / 2];
+      const double dLow    = median - ns.front();
+      const double dHigh   = ns.back() - median;
+      const double dCloser = std::min(dLow, dHigh);
+      const double dFar    = std::max(dLow, dHigh);
+      size_t       lo      = 0;
+      size_t       hi      = ns.size();
+      if (dCloser > 0.0 && dFar > 2.0 * dCloser)
+      {
+         if (dHigh > dLow)
+            --hi; // top sample is the spike
+         else
+            ++lo; // bottom sample is the spike (rare on GPU but cheap to handle)
+      }
+
+      double sum = 0.0;
+      for (size_t i = lo; i < hi; ++i)
+         sum += ns[i];
+      const double resultNs = sum / double(hi - lo);
+
+      TimingResult m {};
+      m.elapsed_ns     = resultNs;
+      m.totalSamples   = uint64_t(lastN) * m_samplesPerDispatch;
+      m.ps_per_sample  = m.totalSamples ? resultNs * 1e3 / double(m.totalSamples) : 0.0;
+      m.gsamples_per_s = resultNs > 0.0 ? double(m.totalSamples) / resultNs : 0.0;
+      m.ms_total       = resultNs * 1e-6;
+      return m;
+   }
+
+   TimingResult runTimed(uint32_t warmupDispatches, uint32_t benchDispatches, const DispatchFn& bindOnce, const DispatchFn& dispatchOne)
+   {
+      m_device->waitIdle();
+      const uint32_t cooldownDispatches = warmupDispatches;
+
+      m_cmdbuf->reset(nbl::video::IGPUCommandBuffer::RESET_FLAGS::NONE);
+      m_cmdbuf->begin(nbl::video::IGPUCommandBuffer::USAGE::ONE_TIME_SUBMIT_BIT);
+      m_cmdbuf->resetQueryPool(m_queryPool.get(), 0, 2);
+
+      if (bindOnce)
+         bindOnce(m_cmdbuf.get());
+
+      for (uint32_t i = 0u; i < warmupDispatches; ++i)
+         dispatchOne(m_cmdbuf.get());
+
+      m_cmdbuf->writeTimestamp(nbl::asset::PIPELINE_STAGE_FLAGS::COMPUTE_SHADER_BIT, m_queryPool.get(), 0);
+      for (uint32_t i = 0u; i < benchDispatches; ++i)
+         dispatchOne(m_cmdbuf.get());
+      m_cmdbuf->writeTimestamp(nbl::asset::PIPELINE_STAGE_FLAGS::COMPUTE_SHADER_BIT, m_queryPool.get(), 1);
+
+      for (uint32_t i = 0u; i < cooldownDispatches; ++i)
+         dispatchOne(m_cmdbuf.get());
+      m_cmdbuf->end();
+
+      auto                                                      semaphore   = m_device->createSemaphore(0u);
+      const nbl::video::IQueue::SSubmitInfo::SCommandBufferInfo benchCmds[] = {{.cmdbuf = m_cmdbuf.get()}};
+      const nbl::video::IQueue::SSubmitInfo::SSemaphoreInfo     signalSem[] = {
+         {.semaphore = semaphore.get(), .value = 1u, .stageMask = nbl::asset::PIPELINE_STAGE_FLAGS::COMPUTE_SHADER_BIT}};
+      nbl::video::IQueue::SSubmitInfo submit = {};
+      submit.commandBuffers                  = benchCmds;
+      submit.signalSemaphores                = signalSem;
+      m_queue->submit({&submit, 1u});
+
+      m_device->waitIdle();
+
+      uint64_t   timestamps[2] = {};
+      const auto flags         = nbl::core::bitflag(nbl::video::IQueryPool::RESULTS_FLAGS::_64_BIT) | nbl::core::bitflag(nbl::video::IQueryPool::RESULTS_FLAGS::WAIT_BIT);
+      m_device->getQueryPoolResults(m_queryPool.get(), 0, 2, timestamps, sizeof(uint64_t), flags);
+
+      TimingResult r {};
+      const double timestampPeriod = double(m_physicalDevice->getLimits().timestampPeriodInNanoSeconds);
+      r.elapsed_ns                 = double(timestamps[1] - timestamps[0]) * timestampPeriod;
+      r.totalSamples               = uint64_t(benchDispatches) * m_samplesPerDispatch;
+      r.ps_per_sample              = r.totalSamples ? r.elapsed_ns * 1e3 / double(r.totalSamples) : 0.0;
+      r.gsamples_per_s             = r.elapsed_ns > 0.0 ? double(r.totalSamples) / r.elapsed_ns : 0.0;
+      r.ms_total                   = r.elapsed_ns * 1e-6;
+      return r;
+   }
+
+protected:
+   std::vector<PipelineEntry> m_pipelines;
+
+private:
+   static void matchStat(const nbl::video::IGPUPipelineBase::SExecutableStatistic& stat, PipelineStats& out, uint64_t& vgpr, uint64_t& sgpr)
+   {
+      const uint64_t v = stat.asUint();
+
+      auto contains = [&](std::string_view kw)
+      {
+         const auto it = std::ranges::search(stat.name, kw,
+            [&](char a, char b)
+            { return std::tolower(a) == std::tolower(b); })
+                            .begin();
+         return it != stat.name.end();
+      };
+
+      // Order matters: more specific keys first.
+
+      if (contains("subgroup size") || contains("subgroupsize") || contains("warp size") || contains("wave size"))
+         out.subgroupSize = std::max<uint32_t>(out.subgroupSize, uint32_t(v));
+
+      else if (contains("vgpr"))
+         vgpr = std::max(vgpr, v);
+      else if (contains("sgpr"))
+         sgpr = std::max(sgpr, v);
+      else if (contains("register"))
+         out.registerCount = std::max(out.registerCount, v);
+
+      else if (contains("binary size") || contains("binarysize") || contains("codesize") || contains("code size") || contains("isa size"))
+         out.codeSizeBytes = std::max(out.codeSizeBytes, v);
+      else if (contains("instructioncount") || contains("instruction count") || contains("numinstructions"))
+         out.codeSizeBytes = std::max(out.codeSizeBytes, v); // proxy when no byte size
+
+      else if (contains("shared memory") || contains("sharedmemory") || contains("groupshared") || contains("lds"))
+         out.sharedMemBytes = std::max(out.sharedMemBytes, v);
+
+      else if (contains("stack size") || contains("stacksize"))
+         out.stackBytes = std::max(out.stackBytes, v);
+
+      else if (contains("local memory") || contains("localmemory") || contains("scratch") || contains("private memory") || contains("privatememory") || contains("stack"))
+         out.privateMemBytes = std::max(out.privateMemBytes, v);
+
+      // Vendor-specific stats
+      // get a structured copy so JSON round-trips the right numeric type.
+      else
+         out.unknowns.push_back(stat);
+   }
+
+   nbl::core::smart_refctd_ptr<nbl::video::ILogicalDevice>    m_device;
+   nbl::core::smart_refctd_ptr<nbl::system::ILogger>          m_logger;
+   nbl::video::IPhysicalDevice*                               m_physicalDevice = nullptr;
+   nbl::video::IQueue*                                        m_queue          = nullptr;
+   nbl::hlsl::uint32_t3                                       m_dispatchGroupCount {};
+   uint64_t                                                   m_samplesPerDispatch = 0;
+   nbl::core::smart_refctd_ptr<nbl::video::IGPUCommandPool>   m_cmdpool;
+   nbl::core::smart_refctd_ptr<nbl::video::IGPUCommandBuffer> m_cmdbuf;
+   nbl::core::smart_refctd_ptr<nbl::video::IQueryPool>        m_queryPool;
+   nbl::core::smart_refctd_ptr<nbl::video::IUtilities>        m_utils; // lazy, only built on first createBdaBuffer call
+};
+
+#endif
diff --git a/common/include/nbl/examples/Benchmark/IBenchmark.h b/common/include/nbl/examples/Benchmark/IBenchmark.h
new file mode 100644
index 000000000..93493c2c6
--- /dev/null
+++ b/common/include/nbl/examples/Benchmark/IBenchmark.h
@@ -0,0 +1,409 @@
+// Copyright (C) 2018-2024 - DevSH Graphics Programming Sp. z O.O.
+// This file is part of the "Nabla Engine".
+// For conditions of distribution and use, see copyright notice in nabla.h
+
+#ifndef _NBL_COMMON_I_BENCHMARK_INCLUDED_
+#define _NBL_COMMON_I_BENCHMARK_INCLUDED_
+
+#include <nabla.h>
+#include "nbl/examples/Benchmark/BenchmarkTypes.h"
+#include "nbl/examples/Benchmark/BenchmarkConsole.h"
+#include "nbl/examples/Benchmark/GPUBenchmarkHelper.h"
+#include "nbl/examples/Benchmark/BenchmarkJson.h"
+#include "nbl/examples/Benchmark/BenchmarkCli.h"
+#include "nlohmann/json.hpp"
+
+#include <algorithm>
+#include <concepts>
+#include <format>
+#include <ranges>
+#include <span>
+#include <string>
+#include <string_view>
+#include <vector>
+
+
+struct RunContext
+{
+   WorkloadShape shape;
+   uint64_t      targetBudgetMs = 400; // wall-clock budget per row
+   std::string   sectionLabel   = "Benchmarks";
+};
+
+// Typical use:
+//
+//   Aggregator agg(logger, logicalDevice, physicalDevice, computeFamilyIndex);
+//   agg.applyCli({.argv = argv, .defaultOutputPath = "Bench.json"});
+//   const RunContext myCtx{.shape = ..., .targetBudgetMs = 400, .sectionLabel = "..."};
+//   std::vector<MyBench> benches;
+//   for (...) benches.emplace_back(agg, MyBench::SetupData{...});
+//   MyOtherBench other(agg, MyOtherBench::SetupData{...});
+//   agg.runSessionAndReport(
+//      Aggregator::Span<MyBench>{std::span(benches), myCtx},
+//      Aggregator::Span<MyOtherBench>{std::span(&other, 1), otherCtx});
+class Aggregator
+{
+   friend class IBenchmark;
+
+public:
+   Aggregator() = default;
+
+   Aggregator(nbl::core::smart_refctd_ptr<nbl::system::ILogger> logger,
+      nbl::core::smart_refctd_ptr<nbl::video::ILogicalDevice>   logicalDevice,
+      nbl::video::IPhysicalDevice*                              physicalDevice,
+      uint32_t                                                  computeFamilyIndex)
+   {
+      m_console.setLogger(std::move(logger));
+      m_logicalDevice      = std::move(logicalDevice);
+      m_physicalDevicePtr  = physicalDevice;
+      m_computeFamilyIndex = computeFamilyIndex;
+      setDevice(physicalDevice);
+   }
+
+   void setSilent(bool silent) { m_console.setSilent(silent); }
+
+   const nbl::core::smart_refctd_ptr<nbl::video::ILogicalDevice>& getLogicalDevice() const { return m_logicalDevice; }
+   nbl::video::IPhysicalDevice*                                   getPhysicalDevice() const { return m_physicalDevicePtr; }
+   uint32_t                                                       getComputeFamilyIndex() const { return m_computeFamilyIndex; }
+   nbl::core::smart_refctd_ptr<nbl::system::ILogger>              getLogger() const
+   {
+      return nbl::core::smart_refctd_ptr<nbl::system::ILogger>(m_console.getLogger());
+   }
+
+   bool loadBaseline(std::string label, const std::string& path)
+   {
+      auto b = benchmark_json::loadBaselineFile(label, path);
+      if (!b)
+         return false;
+
+      for (const auto& [_, row] : b->rowsByName)
+         m_console.growForBaseline(row);
+
+      // Vector (not map) so delta columns print in load order.
+      auto it = std::find_if(m_baselines.begin(), m_baselines.end(),
+         [&](const Baseline& existing) { return existing.label == label; });
+      if (it != m_baselines.end())
+         *it = std::move(*b);
+      else
+         m_baselines.push_back(std::move(*b));
+      return true;
+   }
+
+   bool loadBaseline(const std::string& path) { return loadBaseline("baseline", path); }
+
+   bool writeReport(const std::string& path)
+   {
+      size_t preservedCount = 0;
+      if (!benchmark_json::writeReportFile(path, m_device, m_baselines, m_results, m_console.getLogger(), &preservedCount))
+         return false;
+
+      if (preservedCount > 0)
+         benchLogFmt(m_console.getLogger(), nbl::system::ILogger::ELL_INFO,
+            "Wrote benchmark report to {} ({} new + {} preserved from prior file)",
+            path, m_results.size(), preservedCount);
+      else
+         benchLogFmt(m_console.getLogger(), nbl::system::ILogger::ELL_INFO,
+            "Wrote benchmark report to {} ({} rows)", path, m_results.size());
+      return true;
+   }
+
+   // Captured for the UUID-mismatch warning in applyCli.
+   void setDevice(const nbl::video::IPhysicalDevice* dev) { m_device = benchmark_json::buildDeviceMetadata(dev); }
+
+   struct CliResult
+   {
+      std::string                                             outputPath;
+      nbl::core::vector<nbl::core::vector<nbl::core::string>> focusVariants;
+      uint32_t                                                focusSamples = 3; // --focus-samples, see samplesForCurrentRow
+
+      bool isFocused(const nbl::core::vector<nbl::core::string>& name) const
+      {
+         return std::ranges::find(focusVariants, name) != focusVariants.end();
+      }
+   };
+
+   template<typename T>
+   struct Span
+   {
+      std::span<T> benches;
+      RunContext   context;
+   };
+
+   // Two overloads so a single bench doesn't need `std::span<T>(&bench, 1)`.
+   template<typename Range>
+      requires requires (Range& r) { std::data(r); std::size(r); }
+   static auto makeSpan(Range& benches, RunContext context)
+   {
+      using T = std::remove_reference_t<decltype(*std::data(benches))>;
+      return Span<T>{std::span<T>(std::data(benches), std::size(benches)), std::move(context)};
+   }
+
+   template<typename T>
+      requires std::derived_from<T, IBenchmark>
+   static Span<T> makeSpan(T& bench, RunContext context)
+   {
+      return Span<T>{std::span<T>(&bench, 1), std::move(context)};
+   }
+
+   static std::string describe(const RunContext& ctx)
+   {
+      const auto&    sh             = ctx.shape;
+      const uint32_t wgThreads      = sh.workgroupSize.x * sh.workgroupSize.y * sh.workgroupSize.z;
+      const uint32_t threadsPerDisp = sh.dispatchGroupCount.x * sh.dispatchGroupCount.y * sh.dispatchGroupCount.z * wgThreads;
+      const uint64_t itersPerThread = threadsPerDisp ? sh.samplesPerDispatch / threadsPerDisp : 0;
+      const double   budgetMs       = double(ctx.targetBudgetMs);
+      return std::format("=== {} (~{:.0f}ms/row, {} threads/dispatch, {} iters/thread; wg={}x{}x{}; ps/sample is per all GPU threads) ===",
+         ctx.sectionLabel, budgetMs, threadsPerDisp, itersPerThread, sh.workgroupSize.x, sh.workgroupSize.y, sh.workgroupSize.z);
+   }
+
+   // Order: banner -> focus(spans...) -> comparison table -> banner ->
+   //        column header -> rest(spans...) -> writeReport.
+   // All focus rows print globally first, then all rest rows; banner printed
+   // twice so each chunk reads in isolation when scrolling back.
+   template<typename... Benches>
+      requires(std::derived_from<Benches, IBenchmark> && ...)
+   void runSessionAndReport(Span<Benches>... spans)
+   {
+      // Templated lambda (not `auto& s`) so only Span<T> deduces -- a future
+      // signature change can't silently start passing arbitrary types through.
+      auto runSpan = [this]<typename T>(Span<T>& s, bool silent)
+      {
+         if (s.benches.empty())
+            return;
+         if (!silent)
+         {
+            m_console.logSectionBanner(describe(s.context));
+            m_console.logHeader(m_baselines);
+         }
+         for (auto& e : s.benches)
+            e.run();
+         // Flush after each rest span: if span N+1 dies mid-way, span N's
+         // rows are already on disk. Trailing flush is also the final write.
+         if (!silent)
+            writeReport(m_cli.outputPath);
+      };
+
+      m_console.logBannerNotes(m_baselines);
+      if (!m_cli.focusVariants.empty())
+      {
+         m_console.setSilent(true); // benches read this to know they're in the focused-rows half
+         (runSpan(spans, true), ...);
+         m_console.setSilent(false);
+         m_console.printBaselineComparison(std::span<const nbl::core::vector<nbl::core::string>>(m_focusNames), m_baselines, m_results);
+      }
+      (runSpan(spans, false), ...);
+   }
+
+   struct CliConfig
+   {
+      std::span<const std::string> argv; // feed from IApplicationFramework::argv
+      std::string                  defaultOutputPath = "Bench.json";
+      std::string                  appName           = "benchmark";
+   };
+
+   CliResult applyCli(const CliConfig& cfg)
+   {
+      auto parsed = benchmark_cli::parseArgs(cfg.argv, cfg.defaultOutputPath);
+      if (parsed.helpRequested)
+      {
+         benchmark_cli::printHelp(m_console.getLogger(), cfg.appName, cfg.defaultOutputPath);
+         exit(0);
+      }
+      if (parsed.noColor)
+         m_console.setColorEnabled(false);
+
+      CliResult res;
+      res.outputPath = parsed.outputPath;
+
+      if (!parsed.baselines.empty())
+      {
+         size_t succeeded = 0;
+         for (const auto& [label, path] : parsed.baselines)
+         {
+            if (loadBaseline(label, path))
+            {
+               ++succeeded;
+               benchLogFmt(m_console.getLogger(), nbl::system::ILogger::ELL_INFO,
+                  "Loaded baseline '{}' from {} ({} rows)", label, path, m_baselines.back().rowsByName.size());
+            }
+            else
+               benchLogFmt(m_console.getLogger(), nbl::system::ILogger::ELL_WARNING,
+                  "Failed to load baseline '{}' from {}, skipped", label, path);
+         }
+         if (succeeded == 0)
+            benchLogFmt(m_console.getLogger(), nbl::system::ILogger::ELL_WARNING,
+               "All {} --baseline load(s) failed. delta columns and --focus will be empty. "
+               "Check the paths above; default auto-load of '{}' is suppressed once any --baseline is specified, "
+               "drop the --baseline flag(s) or use --no-baseline to silence this warning.",
+               parsed.baselines.size(), res.outputPath);
+         else if (succeeded < parsed.baselines.size())
+            benchLogFmt(m_console.getLogger(), nbl::system::ILogger::ELL_WARNING,
+               "{} of {} --baseline load(s) failed; continuing with {} loaded.",
+               parsed.baselines.size() - succeeded, parsed.baselines.size(), succeeded);
+      }
+      else if (!parsed.noBaseline)
+      {
+         if (loadBaseline(res.outputPath))
+            benchLogFmt(m_console.getLogger(), nbl::system::ILogger::ELL_INFO,
+               "Loaded baseline from {} ({} rows)", res.outputPath,
+               m_baselines.empty() ? size_t {0} : m_baselines.back().rowsByName.size());
+         else
+            benchLogFmt(m_console.getLogger(), nbl::system::ILogger::ELL_INFO,
+               "No baseline at {}, delta column will read 'n/a'", res.outputPath);
+      }
+
+      warnDeviceMismatch();
+
+      res.focusVariants = std::move(parsed.focus);
+      res.focusSamples  = parsed.focusSamples;
+      m_cli             = res;
+      return res;
+   }
+
+private:
+   void warnDeviceMismatch() const
+   {
+      if (!m_device.is_object() || !m_device.contains("deviceUUID"))
+         return;
+      const auto& currentUUID = m_device["deviceUUID"];
+      for (const auto& b : m_baselines)
+      {
+         if (!b.device.is_object() || !b.device.contains("deviceUUID"))
+            continue;
+         if (b.device["deviceUUID"] == currentUUID)
+            continue;
+         const std::string baselineDevName = b.device.value("name", std::string {"<unknown>"});
+         const std::string currentDevName  = m_device.value("name", std::string {"<unknown>"});
+         benchLogFmt(m_console.getLogger(), nbl::system::ILogger::ELL_WARNING,
+            "Baseline '{}' (from {}) was measured on a different GPU ('{}' vs current '{}'). "
+            "Delta values will be apples-to-oranges.",
+            b.label, b.path, baselineDevName, currentDevName);
+      }
+   }
+
+   // In focus phase (silent), captures the row's name into m_focusNames so
+   // runSessionAndReport can build the comparison table without main.cpp
+   // threading names back through each bench class.
+   void appendAndLog(Result&& r)
+   {
+      const std::string joined = joinName(r.name);
+      if (!m_baselines.empty())
+      {
+         const std::string key = makeKey(r.name);
+         for (const auto& b : m_baselines)
+         {
+            auto it = b.rowsByName.find(key);
+            if (it == b.rowsByName.end())
+               continue;
+            const bool shapeMismatch = r.workload.present() && it->second.workload.present() && (r.workload.shape != it->second.workload.shape);
+            r.baselines[b.label] = {it->second.psPerSample, shapeMismatch};
+         }
+      }
+      m_console.growWidthFor(joined);
+      if (m_console.silent())
+         m_focusNames.push_back(r.name);
+      m_results.push_back(std::move(r));
+      m_console.logRow(std::span<const std::string>(m_results.back().name), joined, m_results.back().timing, m_results.back().stats, m_results.back().baselines, m_baselines);
+   }
+
+   std::vector<Result>                                     m_results;
+   std::vector<Baseline>                                   m_baselines;
+   nbl::core::vector<nbl::core::vector<nbl::core::string>> m_focusNames;
+   nlohmann::json                                          m_device;
+   CliResult                                               m_cli;
+   BenchmarkConsole                                        m_console;
+   nbl::core::smart_refctd_ptr<nbl::video::ILogicalDevice> m_logicalDevice;
+   nbl::video::IPhysicalDevice*                            m_physicalDevicePtr  = nullptr;
+   uint32_t                                                m_computeFamilyIndex = 0;
+};
+
+class IBenchmark
+{
+public:
+   virtual ~IBenchmark() = default;
+
+   // Single-named benches override doRun() and inherit this default filter.
+   // Sweep-style benches synthesize per-row names; they override run() and
+   // do per-row filtering themselves.
+   virtual void run()
+   {
+      const bool silent    = isFocusPhase();
+      const bool inFocus   = isFocused(m_name);
+      const bool shouldRun = silent ? inFocus : !inFocus;
+      if (shouldRun)
+         doRun();
+   }
+
+   uint32_t             getWarmupDispatches() const { return m_warmupDispatches; }
+   uint64_t             getTargetBudgetMs() const { return m_targetBudgetMs; }
+   const WorkloadShape& getShape() const { return m_workloadShape; }
+
+   // Pass this to runTimedBudgeted so only --focus rows pay the K * budget cost.
+   uint32_t samplesForCurrentRow() const { return isFocusPhase() ? m_aggregator.m_cli.focusSamples : 1u; }
+
+protected:
+   // Banner label is NOT taken here; it belongs to the span (see Aggregator::Span).
+   IBenchmark(Aggregator& aggregator, core::vector<core::string> name, uint32_t warmupDispatches, const WorkloadShape& shape, uint64_t targetBudgetMs)
+      : m_name(std::move(name))
+      , m_aggregator(aggregator)
+      , m_warmupDispatches(warmupDispatches)
+      , m_targetBudgetMs(targetBudgetMs)
+      , m_workloadShape(shape)
+   {
+      registerVariant(m_name);
+   }
+
+   virtual void doRun() {}
+
+   bool isFocusPhase() const { return m_aggregator.m_console.silent(); }
+   bool isFocused(const core::vector<core::string>& name) const { return m_aggregator.m_cli.isFocused(name); }
+   void registerVariant(std::span<const std::string> name) { m_aggregator.m_console.registerVariant(name); }
+   void registerVariant(std::initializer_list<std::string_view> name) { m_aggregator.m_console.registerVariant(name); }
+
+   void record(core::vector<core::string> name, const TimingResult& t, const PipelineStats& s)
+   {
+      Workload w{.shape = m_workloadShape};
+      w.benchDispatches = w.shape.samplesPerDispatch ? uint32_t(t.totalSamples / w.shape.samplesPerDispatch) : 0;
+
+      Result r;
+      r.name     = std::move(name);
+      r.timing   = t;
+      r.stats    = s;
+      r.workload = w;
+      m_aggregator.appendAndLog(std::move(r));
+   }
+
+   core::vector<core::string> m_name;
+   Aggregator&                m_aggregator; // non-owning, outlives this bench
+   uint32_t                   m_warmupDispatches;
+   uint64_t                   m_targetBudgetMs;
+   WorkloadShape              m_workloadShape;
+};
+
+class GPUBenchmark : public IBenchmark, public GPUBenchmarkHelper
+{
+public:
+   struct SetupData
+   {
+      core::vector<core::string> name;
+      uint32_t                   warmupDispatches = 0;
+      WorkloadShape              shape            = {};
+      uint64_t                   targetBudgetMs   = 400;
+   };
+
+protected:
+   GPUBenchmark(Aggregator& aggregator, const SetupData& data)
+      : IBenchmark(aggregator, data.name, data.warmupDispatches, data.shape, data.targetBudgetMs)
+   {
+      GPUBenchmarkHelper::init({
+         .device             = aggregator.getLogicalDevice(),
+         .logger             = aggregator.getLogger(),
+         .physicalDevice     = aggregator.getPhysicalDevice(),
+         .computeFamilyIndex = aggregator.getComputeFamilyIndex(),
+         .dispatchGroupCount = data.shape.dispatchGroupCount,
+         .samplesPerDispatch = data.shape.samplesPerDispatch,
+      });
+   }
+};
+
+#endif

From 429ed1965412d41027d9fa85ad31573887098a3d Mon Sep 17 00:00:00 2001
From: Karim Mohamed <karimsayedre@gmail.com>
Date: Wed, 13 May 2026 11:04:03 +0300
Subject: [PATCH 2/4] fixed a typo

---
 64_EmulatedFloatTest/main.cpp | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/64_EmulatedFloatTest/main.cpp b/64_EmulatedFloatTest/main.cpp
index 352e4e61f..11a6d2f55 100644
--- a/64_EmulatedFloatTest/main.cpp
+++ b/64_EmulatedFloatTest/main.cpp
@@ -99,9 +99,9 @@ class CEF64Benchmark : public GPUBenchmark
          [&](IGPUCommandBuffer* cb)
          {
             cb->bindDescriptorSets(EPBP_COMPUTE, pe.layout.get(), 0, 1, &m_ds.get());
-            bindAndPush(cb, pe, pc);
+            defaultBindAndPush(cb, pe, pc);
          },
-         [this](IGPUCommandBuffer* cb) { dispatch(cb); },
+         [this](IGPUCommandBuffer* cb) { defaultDispatch(cb); },
          samplesForCurrentRow());
 
       record(m_name, t, pe.stats);

From 2a47606db6dcdcca3ff6f137a4dce8fda2f78edd Mon Sep 17 00:00:00 2001
From: Arkadiusz Lachowicz <areklachowicz@gmail.com>
Date: Sun, 17 May 2026 12:49:36 +0200
Subject: [PATCH 3/4] Fix GPU benchmark submit chunking

---
 .../examples/Benchmark/GPUBenchmarkHelper.h   | 151 +++++++++++++-----
 1 file changed, 115 insertions(+), 36 deletions(-)

diff --git a/common/include/nbl/examples/Benchmark/GPUBenchmarkHelper.h b/common/include/nbl/examples/Benchmark/GPUBenchmarkHelper.h
index b5bd69a13..3cd5d7a91 100644
--- a/common/include/nbl/examples/Benchmark/GPUBenchmarkHelper.h
+++ b/common/include/nbl/examples/Benchmark/GPUBenchmarkHelper.h
@@ -12,7 +12,9 @@
 #include "nbl/asset/utils/IShaderCompiler.h"
 
 #include <algorithm>
+#include <cmath>
 #include <functional>
+#include <limits>
 #include <ranges>
 #include <span>
 #include <string>
@@ -71,8 +73,8 @@ class GPUBenchmarkHelper
       bool isPrecompiled() const { return !precompiledKey.empty(); }
    };
 
-   // Layout: [bindOnce] [warmup x dispatchOne][ts0][bench x dispatchOne][ts1][cooldown x dispatchOne]
-   // Cooldown == warmup so the measured window isn't on a winding-down tail.
+   // Logical layout: [warmup x dispatchOne][ts0][bench x dispatchOne][ts1][cooldown x dispatchOne]
+   // Warmup/cooldown can be split into shorter submissions and the measured window stays intact.
    // Putting binds inside dispatchOne adds per-iteration cmdbuf overhead that
    // shows up in ps/sample on tight shaders.
    using DispatchFn = std::function<void(nbl::video::IGPUCommandBuffer*)>;
@@ -513,8 +515,22 @@ class GPUBenchmarkHelper
       const uint64_t     targetBudgetNs = targetBudgetMs * 1'000'000ull;
       constexpr uint32_t kPilotN        = 64;
       constexpr uint32_t kMaxN          = 1u << 24; // safety cap for ultra-fast shaders
-      TimingResult       r              = runTimed(warmupDispatches, kPilotN, bindOnce, dispatchOne);
+      uint32_t           dispatchesPerSubmit = 1u;
+      TimingResult       r                   = runTimed(warmupDispatches, kPilotN, bindOnce, dispatchOne, dispatchesPerSubmit);
+      dispatchesPerSubmit                    = estimateDispatchesPerSubmit(r, kPilotN);
       uint32_t           lastN          = kPilotN;
+      while (r.elapsed_ns > targetBudgetNs && lastN > 1u)
+      {
+         const double scale = double(targetBudgetNs) / r.elapsed_ns;
+         uint32_t     nextN = uint32_t(std::max(1.0, std::floor(double(lastN) * scale)));
+         if (nextN >= lastN)
+            nextN = lastN - 1u;
+
+         r                   = runTimed(warmupDispatches, nextN, bindOnce, dispatchOne, dispatchesPerSubmit);
+         dispatchesPerSubmit = estimateDispatchesPerSubmit(r, nextN);
+         lastN               = nextN;
+      }
+
       while (r.elapsed_ns < targetBudgetNs && lastN < kMaxN)
       {
          uint32_t nextN;
@@ -529,8 +545,9 @@ class GPUBenchmarkHelper
          }
          if (nextN <= lastN)
             break; // converged
-         r     = runTimed(warmupDispatches, nextN, bindOnce, dispatchOne);
-         lastN = nextN;
+         r                   = runTimed(warmupDispatches, nextN, bindOnce, dispatchOne, dispatchesPerSubmit);
+         dispatchesPerSubmit = estimateDispatchesPerSubmit(r, nextN);
+         lastN               = nextN;
       }
 
       if (samples <= 1)
@@ -545,7 +562,7 @@ class GPUBenchmarkHelper
       ns.push_back(r.elapsed_ns);
       for (uint32_t i = 1; i < samples; ++i)
       {
-         const TimingResult ri = runTimed(warmupDispatches, lastN, bindOnce, dispatchOne);
+         const TimingResult ri = runTimed(warmupDispatches, lastN, bindOnce, dispatchOne, dispatchesPerSubmit);
          ns.push_back(ri.elapsed_ns);
       }
       std::sort(ns.begin(), ns.end());
@@ -580,48 +597,53 @@ class GPUBenchmarkHelper
       return m;
    }
 
-   TimingResult runTimed(uint32_t warmupDispatches, uint32_t benchDispatches, const DispatchFn& bindOnce, const DispatchFn& dispatchOne)
+   TimingResult runTimed(uint32_t warmupDispatches, uint32_t benchDispatches, const DispatchFn& bindOnce, const DispatchFn& dispatchOne, uint32_t maxDispatchesPerSubmit)
    {
-      m_device->waitIdle();
+      if (m_device->waitIdle() != nbl::video::IQueue::RESULT::SUCCESS)
+         return {};
+
       const uint32_t cooldownDispatches = warmupDispatches;
 
-      m_cmdbuf->reset(nbl::video::IGPUCommandBuffer::RESET_FLAGS::NONE);
-      m_cmdbuf->begin(nbl::video::IGPUCommandBuffer::USAGE::ONE_TIME_SUBMIT_BIT);
-      m_cmdbuf->resetQueryPool(m_queryPool.get(), 0, 2);
+      if (!runUntimedDispatches(warmupDispatches, bindOnce, dispatchOne, maxDispatchesPerSubmit))
+         return {};
 
-      if (bindOnce)
-         bindOnce(m_cmdbuf.get());
+      double   elapsedNs = 0.0;
+      uint32_t remaining = benchDispatches;
+      while (remaining > 0u)
+      {
+         const uint32_t batch = std::min(remaining, std::max(1u, maxDispatchesPerSubmit));
 
-      for (uint32_t i = 0u; i < warmupDispatches; ++i)
-         dispatchOne(m_cmdbuf.get());
+         m_cmdbuf->reset(nbl::video::IGPUCommandBuffer::RESET_FLAGS::NONE);
+         m_cmdbuf->begin(nbl::video::IGPUCommandBuffer::USAGE::ONE_TIME_SUBMIT_BIT);
+         m_cmdbuf->resetQueryPool(m_queryPool.get(), 0, 2);
 
-      m_cmdbuf->writeTimestamp(nbl::asset::PIPELINE_STAGE_FLAGS::COMPUTE_SHADER_BIT, m_queryPool.get(), 0);
-      for (uint32_t i = 0u; i < benchDispatches; ++i)
-         dispatchOne(m_cmdbuf.get());
-      m_cmdbuf->writeTimestamp(nbl::asset::PIPELINE_STAGE_FLAGS::COMPUTE_SHADER_BIT, m_queryPool.get(), 1);
+         if (bindOnce)
+            bindOnce(m_cmdbuf.get());
 
-      for (uint32_t i = 0u; i < cooldownDispatches; ++i)
-         dispatchOne(m_cmdbuf.get());
-      m_cmdbuf->end();
+         m_cmdbuf->writeTimestamp(nbl::asset::PIPELINE_STAGE_FLAGS::COMPUTE_SHADER_BIT, m_queryPool.get(), 0);
+         for (uint32_t i = 0u; i < batch; ++i)
+            dispatchOne(m_cmdbuf.get());
+         m_cmdbuf->writeTimestamp(nbl::asset::PIPELINE_STAGE_FLAGS::COMPUTE_SHADER_BIT, m_queryPool.get(), 1);
+         m_cmdbuf->end();
 
-      auto                                                      semaphore   = m_device->createSemaphore(0u);
-      const nbl::video::IQueue::SSubmitInfo::SCommandBufferInfo benchCmds[] = {{.cmdbuf = m_cmdbuf.get()}};
-      const nbl::video::IQueue::SSubmitInfo::SSemaphoreInfo     signalSem[] = {
-         {.semaphore = semaphore.get(), .value = 1u, .stageMask = nbl::asset::PIPELINE_STAGE_FLAGS::COMPUTE_SHADER_BIT}};
-      nbl::video::IQueue::SSubmitInfo submit = {};
-      submit.commandBuffers                  = benchCmds;
-      submit.signalSemaphores                = signalSem;
-      m_queue->submit({&submit, 1u});
+         if (!submitAndWait())
+            return {};
 
-      m_device->waitIdle();
+         uint64_t   timestamps[2] = {};
+         const auto flags         = nbl::core::bitflag(nbl::video::IQueryPool::RESULTS_FLAGS::_64_BIT) | nbl::core::bitflag(nbl::video::IQueryPool::RESULTS_FLAGS::WAIT_BIT);
+         if (!m_device->getQueryPoolResults(m_queryPool.get(), 0, 2, timestamps, sizeof(uint64_t), flags))
+            return {};
+
+         const double timestampPeriod = double(m_physicalDevice->getLimits().timestampPeriodInNanoSeconds);
+         elapsedNs += double(timestamps[1] - timestamps[0]) * timestampPeriod;
+         remaining -= batch;
+      }
 
-      uint64_t   timestamps[2] = {};
-      const auto flags         = nbl::core::bitflag(nbl::video::IQueryPool::RESULTS_FLAGS::_64_BIT) | nbl::core::bitflag(nbl::video::IQueryPool::RESULTS_FLAGS::WAIT_BIT);
-      m_device->getQueryPoolResults(m_queryPool.get(), 0, 2, timestamps, sizeof(uint64_t), flags);
+      if (!runUntimedDispatches(cooldownDispatches, bindOnce, dispatchOne, maxDispatchesPerSubmit))
+         return {};
 
       TimingResult r {};
-      const double timestampPeriod = double(m_physicalDevice->getLimits().timestampPeriodInNanoSeconds);
-      r.elapsed_ns                 = double(timestamps[1] - timestamps[0]) * timestampPeriod;
+      r.elapsed_ns                 = elapsedNs;
       r.totalSamples               = uint64_t(benchDispatches) * m_samplesPerDispatch;
       r.ps_per_sample              = r.totalSamples ? r.elapsed_ns * 1e3 / double(r.totalSamples) : 0.0;
       r.gsamples_per_s             = r.elapsed_ns > 0.0 ? double(r.totalSamples) / r.elapsed_ns : 0.0;
@@ -633,6 +655,63 @@ class GPUBenchmarkHelper
    std::vector<PipelineEntry> m_pipelines;
 
 private:
+   // Soft target for one queue submit, estimated from timings on the current GPU.
+   // Benchmark budgets still control measured work. This only chunks submits.
+   static constexpr double SubmitChunkTargetNs = 250'000'000.0;
+
+   static uint32_t estimateDispatchesPerSubmit(const TimingResult& r, uint32_t dispatches)
+   {
+      if (dispatches == 0u || r.elapsed_ns <= 0.0)
+         return 1u;
+
+      const double nsPerDispatch = r.elapsed_ns / double(dispatches);
+      if (nsPerDispatch <= 0.0)
+         return 1u;
+
+      const double maxDispatches = std::floor(SubmitChunkTargetNs / nsPerDispatch);
+      return uint32_t(std::clamp(maxDispatches, 1.0, double(std::numeric_limits<uint32_t>::max())));
+   }
+
+   bool submitAndWait()
+   {
+      auto semaphore = m_device->createSemaphore(0u);
+      if (!semaphore)
+         return false;
+
+      const nbl::video::IQueue::SSubmitInfo::SCommandBufferInfo cmds[] = {{.cmdbuf = m_cmdbuf.get()}};
+      const nbl::video::IQueue::SSubmitInfo::SSemaphoreInfo     done[] = {
+         {.semaphore = semaphore.get(), .value = 1u, .stageMask = nbl::asset::PIPELINE_STAGE_FLAGS::ALL_COMMANDS_BITS}};
+      nbl::video::IQueue::SSubmitInfo submit = {};
+      submit.commandBuffers                  = cmds;
+      submit.signalSemaphores                = done;
+      if (m_queue->submit({&submit, 1u}) != nbl::video::IQueue::RESULT::SUCCESS)
+         return false;
+
+      const nbl::video::ISemaphore::SWaitInfo wait[] = {{.semaphore = semaphore.get(), .value = 1u}};
+      return m_device->blockForSemaphores(wait) == nbl::video::ISemaphore::WAIT_RESULT::SUCCESS;
+   }
+
+   bool runUntimedDispatches(uint32_t dispatches, const DispatchFn& bindOnce, const DispatchFn& dispatchOne, uint32_t maxDispatchesPerSubmit)
+   {
+      while (dispatches > 0u)
+      {
+         const uint32_t batch = std::min(dispatches, std::max(1u, maxDispatchesPerSubmit));
+
+         m_cmdbuf->reset(nbl::video::IGPUCommandBuffer::RESET_FLAGS::NONE);
+         m_cmdbuf->begin(nbl::video::IGPUCommandBuffer::USAGE::ONE_TIME_SUBMIT_BIT);
+         if (bindOnce)
+            bindOnce(m_cmdbuf.get());
+         for (uint32_t i = 0u; i < batch; ++i)
+            dispatchOne(m_cmdbuf.get());
+         m_cmdbuf->end();
+
+         if (!submitAndWait())
+            return false;
+         dispatches -= batch;
+      }
+      return true;
+   }
+
    static void matchStat(const nbl::video::IGPUPipelineBase::SExecutableStatistic& stat, PipelineStats& out, uint64_t& vgpr, uint64_t& sgpr)
    {
       const uint64_t v = stat.asUint();

From f8c9b02fcd45727a864290cf993a07144b87d1e3 Mon Sep 17 00:00:00 2001
From: Arkadiusz Lachowicz <areklachowicz@gmail.com>
Date: Mon, 18 May 2026 10:45:22 +0200
Subject: [PATCH 4/4] Fix benchmark sync and failed reruns

---
 .../benchmarks/CDiscreteSamplerBenchmark.h    |  10 +-
 .../benchmarks/CSamplerBenchmark.h            |   8 +-
 37_HLSLSamplingTests/main.cpp                 | 133 +++++--
 .../tests/SamplerTestHelpers.h                |   4 +-
 .../tests/property/CSamplerPropertyTester.h   |  20 +-
 64_EmulatedFloatTest/main.cpp                 |  12 +-
 .../nbl/examples/Benchmark/BenchmarkJson.h    |  49 ++-
 .../examples/Benchmark/GPUBenchmarkHelper.h   |  24 +-
 .../nbl/examples/Tester/FailureManifest.h     | 331 ++++++++++++++++++
 common/include/nbl/examples/Tester/ITester.h  |  32 +-
 10 files changed, 551 insertions(+), 72 deletions(-)
 create mode 100644 common/include/nbl/examples/Tester/FailureManifest.h

diff --git a/37_HLSLSamplingTests/benchmarks/CDiscreteSamplerBenchmark.h b/37_HLSLSamplingTests/benchmarks/CDiscreteSamplerBenchmark.h
index d6289f54b..f12ba9421 100644
--- a/37_HLSLSamplingTests/benchmarks/CDiscreteSamplerBenchmark.h
+++ b/37_HLSLSamplingTests/benchmarks/CDiscreteSamplerBenchmark.h
@@ -198,7 +198,9 @@ class CDiscreteSamplerBenchmark : public GPUBenchmark
       // Pipeline + push constants are bound *once* in bindOnce, the inner loop is just
       // dispatch(...). Putting binds inside dispatchOne would inflate ps/sample on the
       // tighter samplers.
-      const PipelineEntry& pe = m_pipelines[m_pipelineIdx[size_t(kind)]];
+      const PipelineEntry* pe = getPipelineEntry(m_pipelineIdx[size_t(kind)], joinName(name));
+      if (!pe)
+         return;
 
       const TimingResult timingResult = runTimedBudgeted(warmupIterations, getTargetBudgetMs(),
          [&](IGPUCommandBuffer* cb)
@@ -210,7 +212,7 @@ class CDiscreteSamplerBenchmark : public GPUBenchmark
                pc.pdfAddress                 = m_aliasPdfBuf->getDeviceAddress();
                pc.outputAddress              = m_outputBuf->getDeviceAddress();
                pc.tableSize                  = m_aliasTableN;
-               defaultBindAndPush(cb, pe, pc);
+               defaultBindAndPush(cb, *pe, pc);
             }
             else
             {
@@ -219,13 +221,13 @@ class CDiscreteSamplerBenchmark : public GPUBenchmark
                pc.cumProbAddress        = buf->getDeviceAddress();
                pc.outputAddress         = m_outputBuf->getDeviceAddress();
                pc.tableSize             = N;
-               defaultBindAndPush(cb, pe, pc);
+               defaultBindAndPush(cb, *pe, pc);
             }
          },
          [this](IGPUCommandBuffer* cb) { defaultDispatch(cb); },
          samplesForCurrentRow());
 
-      record(std::move(name), timingResult, pe.stats);
+      record(std::move(name), timingResult, pe->stats);
    }
 
    core::smart_refctd_ptr<IAssetManager> m_assetMgr;
diff --git a/37_HLSLSamplingTests/benchmarks/CSamplerBenchmark.h b/37_HLSLSamplingTests/benchmarks/CSamplerBenchmark.h
index dd7149829..7410b7242 100644
--- a/37_HLSLSamplingTests/benchmarks/CSamplerBenchmark.h
+++ b/37_HLSLSamplingTests/benchmarks/CSamplerBenchmark.h
@@ -39,16 +39,18 @@ class CSamplerBenchmark : public GPUBenchmark
 
    void doRun() override
    {
-      const PipelineEntry&      pe = m_pipelines[m_pipelineIdx];
+      const PipelineEntry*      pe = getPipelineEntry(m_pipelineIdx, joinName(m_name));
+      if (!pe)
+         return;
       SamplerBenchPushConstants pc = {};
       pc.outputAddress             = m_outputAddress;
 
       const TimingResult t = runTimedBudgeted(getWarmupDispatches(), getTargetBudgetMs(),
-         [&](video::IGPUCommandBuffer* cb) { defaultBindAndPush(cb, pe, pc); },
+         [&](video::IGPUCommandBuffer* cb) { defaultBindAndPush(cb, *pe, pc); },
          [this](video::IGPUCommandBuffer* cb) { defaultDispatch(cb); },
          samplesForCurrentRow());
 
-      record(m_name, t, pe.stats);
+      record(m_name, t, pe->stats);
    }
 
    private:
diff --git a/37_HLSLSamplingTests/main.cpp b/37_HLSLSamplingTests/main.cpp
index 634e84123..1c3f6000d 100644
--- a/37_HLSLSamplingTests/main.cpp
+++ b/37_HLSLSamplingTests/main.cpp
@@ -52,6 +52,7 @@ using namespace nbl::examples;
 
 #include "benchmarks/CSamplerBenchmark.h"
 #include "benchmarks/CDiscreteSamplerBenchmark.h"
+#include "nbl/examples/Tester/FailureManifest.h"
 #include "tests/property/CSamplerPropertyTester.h"
 
 
@@ -189,6 +190,12 @@ class HLSLSamplingTests final : public application_templates::MonoDeviceApplicat
 
       m_logger->log("All sampling concept tests passed.", ILogger::ELL_INFO);
 
+      const auto runControl = nbl::examples::testing::parseRunControl(this->argv, m_logger.get());
+      if (!runControl.valid)
+         return false;
+
+      nbl::examples::testing::FailureManifest failureManifest("37_HLSLSamplingTests");
+
       // ======================================================================
       // GPU throughput benchmarks
       // ======================================================================
@@ -197,6 +204,12 @@ class HLSLSamplingTests final : public application_templates::MonoDeviceApplicat
 
       if constexpr (DoBenchmark)
       {
+         if (runControl.skipBenchmarks)
+         {
+            m_logger->log("Skipping benchmark phase due to CLI.", ILogger::ELL_INFO);
+         }
+         else
+         {
          constexpr uint32_t benchWorkgroupSize      = WORKGROUP_SIZE;
          constexpr uint32_t totalThreadsPerDispatch = benchWorkgroupsCount * benchWorkgroupSize;
          constexpr uint32_t iterationsPerThread     = BENCH_ITERS;
@@ -346,6 +359,7 @@ class HLSLSamplingTests final : public application_templates::MonoDeviceApplicat
                Aggregator::makeSpan(benchmarks,    samplerCtx),
                Aggregator::makeSpan(discreteBench, discreteCtx));
          }
+         }
       }
 
       // ================================================================
@@ -353,9 +367,16 @@ class HLSLSamplingTests final : public application_templates::MonoDeviceApplicat
       // ================================================================
       bool pass = true;
       constexpr uint32_t testWorkgroupsCount = 4096;
+      bool samplerPass = true;
       // generic lambda to run a GPU sampler test
-      auto runSamplerTest = [&]<typename Tester, core::StringLiteral ShaderKey>(const char* testName, const char* logFile)
+      auto runSamplerTest = [&]<typename Tester, core::StringLiteral ShaderKey>(const char* id, const char* testName, const char* logFile)
       {
+         if (!runControl.filter.shouldRun(id))
+         {
+            m_logger->log("Skipping %s tests due to filter.", ILogger::ELL_INFO, testName);
+            return;
+         }
+
          m_logger->log("Running %s tests...", ILogger::ELL_INFO, testName);
          typename Tester::PipelineSetupData data;
          data.device             = m_device;
@@ -367,43 +388,58 @@ class HLSLSamplingTests final : public application_templates::MonoDeviceApplicat
          data.shaderKey          = std::move(nbl::this_example::builtin::build::get_spirv_key<ShaderKey>(m_device.get()));
          Tester tester(testWorkgroupsCount);
          tester.setupPipeline(data);
-         pass &= tester.performTestsAndVerifyResults(logFile);
+         if (const auto seed = runControl.filter.seedFor(id); seed.has_value())
+            tester.setSeed(*seed);
+         tester.setFailureRecordContext(&failureManifest, "sampler", id, testName);
+         samplerPass &= tester.performTestsAndVerifyResults(logFile);
       };
 
       // --- Sampler tests ---
       if constexpr (true)
       {
-         runSamplerTest.operator()<CLinearTester, "linear_test">("Linear sampler", "LinearTestLog.txt");
-         runSamplerTest.operator()<CBilinearTester, "bilinear_test">("Bilinear sampler", "BilinearTestLog.txt");
-         runSamplerTest.operator()<CUniformHemisphereTester, "uniform_hemisphere_test">("UniformHemisphere sampler", "UniformHemisphereTestLog.txt");
-         runSamplerTest.operator()<CUniformSphereTester, "uniform_sphere_test">("UniformSphere sampler", "UniformSphereTestLog.txt");
-         runSamplerTest.operator()<CProjectedHemisphereTester, "projected_hemisphere_test">("ProjectedHemisphere sampler", "ProjectedHemisphereTestLog.txt");
-         runSamplerTest.operator()<CProjectedSphereTester, "projected_sphere_test">("ProjectedSphere sampler", "ProjectedSphereTestLog.txt");
-         runSamplerTest.operator()<CConcentricMappingTester, "concentric_mapping_test">("ConcentricMapping sampler", "ConcentricMappingTestLog.txt");
-         runSamplerTest.operator()<CPolarMappingTester, "polar_mapping_test">("PolarMapping sampler", "PolarMappingTestLog.txt");
-         runSamplerTest.operator()<CBoxMullerTransformTester, "box_muller_transform_test">("BoxMullerTransform sampler", "BoxMullerTransformTestLog.txt");
-         runSamplerTest.operator()<CSphericalTriangleTester, "spherical_triangle">("SphericalTriangle", "SphericalTriangleTestLog.txt");
-         runSamplerTest.operator()<CProjectedSphericalTriangleTester, "projected_spherical_triangle_test">("ProjectedSphericalTriangle sampler", "ProjectedSphericalTriangleTestLog.txt");
-         runSamplerTest.operator()<CSphericalRectangleTester, "spherical_rectangle_test">("SphericalRectangle sampler", "SphericalRectangleTestLog.txt");
-         runSamplerTest.operator()<CProjectedSphericalRectangleTester, "projected_spherical_rectangle_test">("ProjectedSphericalRectangle sampler", "ProjectedSphericalRectangleTestLog.txt");
+         runSamplerTest.operator()<CLinearTester, "linear_test">("sampler/Linear", "Linear sampler", "LinearTestLog.txt");
+         runSamplerTest.operator()<CBilinearTester, "bilinear_test">("sampler/Bilinear", "Bilinear sampler", "BilinearTestLog.txt");
+         runSamplerTest.operator()<CUniformHemisphereTester, "uniform_hemisphere_test">("sampler/UniformHemisphere", "UniformHemisphere sampler", "UniformHemisphereTestLog.txt");
+         runSamplerTest.operator()<CUniformSphereTester, "uniform_sphere_test">("sampler/UniformSphere", "UniformSphere sampler", "UniformSphereTestLog.txt");
+         runSamplerTest.operator()<CProjectedHemisphereTester, "projected_hemisphere_test">("sampler/ProjectedHemisphere", "ProjectedHemisphere sampler", "ProjectedHemisphereTestLog.txt");
+         runSamplerTest.operator()<CProjectedSphereTester, "projected_sphere_test">("sampler/ProjectedSphere", "ProjectedSphere sampler", "ProjectedSphereTestLog.txt");
+         runSamplerTest.operator()<CConcentricMappingTester, "concentric_mapping_test">("sampler/ConcentricMapping", "ConcentricMapping sampler", "ConcentricMappingTestLog.txt");
+         runSamplerTest.operator()<CPolarMappingTester, "polar_mapping_test">("sampler/PolarMapping", "PolarMapping sampler", "PolarMappingTestLog.txt");
+         runSamplerTest.operator()<CBoxMullerTransformTester, "box_muller_transform_test">("sampler/BoxMullerTransform", "BoxMullerTransform sampler", "BoxMullerTransformTestLog.txt");
+         runSamplerTest.operator()<CSphericalTriangleTester, "spherical_triangle">("sampler/SphericalTriangle", "SphericalTriangle", "SphericalTriangleTestLog.txt");
+         runSamplerTest.operator()<CProjectedSphericalTriangleTester, "projected_spherical_triangle_test">("sampler/ProjectedSphericalTriangle", "ProjectedSphericalTriangle sampler", "ProjectedSphericalTriangleTestLog.txt");
+         runSamplerTest.operator()<CSphericalRectangleTester, "spherical_rectangle_test">("sampler/SphericalRectangle", "SphericalRectangle sampler", "SphericalRectangleTestLog.txt");
+         runSamplerTest.operator()<CProjectedSphericalRectangleTester, "projected_spherical_rectangle_test">("sampler/ProjectedSphericalRectangle", "ProjectedSphericalRectangle sampler", "ProjectedSphericalRectangleTestLog.txt");
       }
 
       if constexpr (true)
       {
          // --- Discrete table construction (CPU) ---
          {
-            m_logger->log("Running discrete table builder tests (CPU)...", ILogger::ELL_INFO);
-            CDiscreteTableTester tableTester(m_logger.get());
-            pass &= tableTester.run();
+            constexpr const char* id = "sampler/DiscreteTableBuilder";
+            if (!runControl.filter.shouldRun(id))
+            {
+               m_logger->log("Skipping discrete table builder tests due to filter.", ILogger::ELL_INFO);
+            }
+            else
+            {
+               m_logger->log("Running discrete table builder tests (CPU)...", ILogger::ELL_INFO);
+               CDiscreteTableTester tableTester(m_logger.get());
+               const bool ok = tableTester.run();
+               samplerPass &= ok;
+               if (!ok)
+                  failureManifest.addGroupFailure("sampler", id, "Discrete table builder");
+            }
          }
 
          // --- GPU table sampler tests ---
-         runSamplerTest.operator()<CPackedAliasAGPUTester, "packed_alias_a_test">("PackedAliasA GPU sampler", "PackedAliasATestLog.txt");
-         runSamplerTest.operator()<CPackedAliasBGPUTester, "packed_alias_b_test">("PackedAliasB GPU sampler", "PackedAliasBTestLog.txt");
-         runSamplerTest.operator()<CCumulativeProbabilityGPUTester, "cumulative_probability_test">("CumulativeProbability GPU sampler", "CumulativeProbabilityTestLog.txt");
+         runSamplerTest.operator()<CPackedAliasAGPUTester, "packed_alias_a_test">("sampler/PackedAliasA", "PackedAliasA GPU sampler", "PackedAliasATestLog.txt");
+         runSamplerTest.operator()<CPackedAliasBGPUTester, "packed_alias_b_test">("sampler/PackedAliasB", "PackedAliasB GPU sampler", "PackedAliasBTestLog.txt");
+         runSamplerTest.operator()<CCumulativeProbabilityGPUTester, "cumulative_probability_test">("sampler/CumulativeProbability", "CumulativeProbability GPU sampler", "CumulativeProbabilityTestLog.txt");
       }
       logJacobianSkipCounts(m_logger.get());
-      if (pass)
+      pass &= samplerPass;
+      if (samplerPass)
          m_logger->log("All sampling tests PASSED.", ILogger::ELL_INFO);
       else
          m_logger->log("Some sampling tests FAILED. Check log files for details.", ILogger::ELL_ERROR);
@@ -413,12 +449,28 @@ class HLSLSamplingTests final : public application_templates::MonoDeviceApplicat
       // ================================================================
       if constexpr (true)
       {
+         bool propertyPass = true;
          m_logger->log("Running sampler property tests (CPU)...", ILogger::ELL_INFO);
          m_logger->log("WARNING: CPU math may use higher intermediate precision than GPU shaders. Tolerances that pass here may be too tight for GPU.", ILogger::ELL_WARNING);
 
          auto check = [&]<typename Config>()
          {
-            pass &= CSamplerPropertyTester<Config>(m_logger.get()).run();
+            const std::string id = std::string("property/") + Config::name();
+            if (!runControl.filter.shouldRun(id))
+            {
+               m_logger->log("Skipping %s property tests due to filter.", ILogger::ELL_INFO, Config::name());
+               return;
+            }
+
+            CSamplerPropertyTester<Config> tester(m_logger.get(), runControl.filter.seedFor(id));
+            const bool ok = tester.run();
+            propertyPass &= ok;
+            if (!ok)
+            {
+               failureManifest.addGroupFailure("property", id, Config::name());
+               if (const auto seed = tester.failureSeed(); seed.has_value())
+                  failureManifest.addCase("property", id, Config::name(), "property", "CPU", 0, *seed, 0.0, 0.0);
+            }
          };
 
          check.operator()<LinearPropertyConfig>();
@@ -444,7 +496,8 @@ class HLSLSamplingTests final : public application_templates::MonoDeviceApplicat
          // Grazing angle tests
          check.operator()<ProjectedSphericalTriangleGrazingConfig>();
 
-         if (pass)
+         pass &= propertyPass;
+         if (propertyPass)
             m_logger->log("All sampler property tests PASSED.", ILogger::ELL_INFO);
          else
             m_logger->log("Some sampler property tests FAILED.", ILogger::ELL_ERROR);
@@ -455,27 +508,41 @@ class HLSLSamplingTests final : public application_templates::MonoDeviceApplicat
       // ================================================================
       if constexpr (true)
       {
+         bool geometryPass = true;
          m_logger->log("Running geometry tests (CPU)...", ILogger::ELL_INFO);
          m_logger->log("WARNING: CPU math may use higher intermediate precision than GPU shaders. Tolerances that pass here may be too tight for GPU.", ILogger::ELL_WARNING);
 
-         auto check = [&]<typename Tester>()
+         auto check = [&]<typename Tester>(const char* id, const char* name)
          {
-            pass &= Tester(m_logger.get()).run();
+            if (!runControl.filter.shouldRun(id))
+            {
+               m_logger->log("Skipping %s geometry tests due to filter.", ILogger::ELL_INFO, name);
+               return;
+            }
+
+            const bool ok = Tester(m_logger.get()).run();
+            geometryPass &= ok;
+            if (!ok)
+               failureManifest.addGroupFailure("geometry", id, name);
          };
 
-         check.template operator()<CSolidAngleAccuracyTester>();
-         check.template operator()<CSphericalTriangleGenerateTester>();
-         check.template operator()<CSphericalRectangleGenerateTester>();
-         check.template operator()<CProjectedSphericalRectangleGenerateTester>();
-         check.template operator()<CProjectedSphericalRectangleGeometricTester>();
-         check.template operator()<CProjectedSphericalTriangleGeometricTester>();
+         check.template operator()<CSolidAngleAccuracyTester>("geometry/SolidAngleAccuracy", "SolidAngleAccuracy");
+         check.template operator()<CSphericalTriangleGenerateTester>("geometry/SphericalTriangleGenerate", "SphericalTriangleGenerate");
+         check.template operator()<CSphericalRectangleGenerateTester>("geometry/SphericalRectangleGenerate", "SphericalRectangleGenerate");
+         check.template operator()<CProjectedSphericalRectangleGenerateTester>("geometry/ProjectedSphericalRectangleGenerate", "ProjectedSphericalRectangleGenerate");
+         check.template operator()<CProjectedSphericalRectangleGeometricTester>("geometry/ProjectedSphericalRectangle", "ProjectedSphericalRectangle");
+         check.template operator()<CProjectedSphericalTriangleGeometricTester>("geometry/ProjectedSphericalTriangle", "ProjectedSphericalTriangle");
 
-         if (pass)
+         pass &= geometryPass;
+         if (geometryPass)
             m_logger->log("All geometry tests PASSED.", ILogger::ELL_INFO);
          else
             m_logger->log("Some geometry tests FAILED.", ILogger::ELL_ERROR);
       }
 
+      if (!runControl.failedOutPath.empty())
+         pass &= nbl::examples::testing::writeFailureManifestFile(failureManifest, runControl.failedOutPath, m_logger.get());
+
       return pass;
    }
 
diff --git a/37_HLSLSamplingTests/tests/SamplerTestHelpers.h b/37_HLSLSamplingTests/tests/SamplerTestHelpers.h
index 44dd5f961..1246ebc08 100644
--- a/37_HLSLSamplingTests/tests/SamplerTestHelpers.h
+++ b/37_HLSLSamplingTests/tests/SamplerTestHelpers.h
@@ -7,6 +7,8 @@
 #include <nbl/builtin/hlsl/shapes/spherical_triangle.hlsl>
 #include <nbl/builtin/hlsl/shapes/spherical_rectangle.hlsl>
 
+#include <optional>
+
 // ============================================================================
 // Declarative field verification helpers
 //
@@ -475,7 +477,7 @@ struct SeededTestContext
    std::mt19937 rng;
    uint32_t failCount = 0;
 
-   SeededTestContext() : seed(std::random_device {}()), rng(seed) {}
+   SeededTestContext(std::optional<uint32_t> seedOverride = {}) : seed(seedOverride.value_or(std::random_device {}())), rng(seed) {}
 
    // Log "reproduce with seed" if failCount > 0, return failCount == 0
    bool finalize(nbl::system::ILogger* logger, const char* tag) const
diff --git a/37_HLSLSamplingTests/tests/property/CSamplerPropertyTester.h b/37_HLSLSamplingTests/tests/property/CSamplerPropertyTester.h
index ecb0f606d..b20ba88f9 100644
--- a/37_HLSLSamplingTests/tests/property/CSamplerPropertyTester.h
+++ b/37_HLSLSamplingTests/tests/property/CSamplerPropertyTester.h
@@ -78,7 +78,9 @@ class CSamplerPropertyTester
    }
 
    public:
-   CSamplerPropertyTester(system::ILogger* logger) : m_logger(logger) {}
+   CSamplerPropertyTester(system::ILogger* logger, std::optional<uint32_t> seedOverride = {}) : m_logger(logger), m_seedOverride(seedOverride) {}
+
+   std::optional<uint32_t> failureSeed() const { return m_failureSeed; }
 
    bool run()
    {
@@ -96,7 +98,7 @@ class CSamplerPropertyTester
    // If the PDF normalization is wrong by factor k, this will be off by 1/k.
    bool testMonteCarloPdfNormalization()
    {
-      SeededTestContext ctx;
+      SeededTestContext ctx(m_seedOverride);
       uint32_t evaluatedConfigs = 0;
 
       for (uint32_t c = 0; c < Config::numConfigurations; c++)
@@ -159,7 +161,10 @@ class CSamplerPropertyTester
          m_logger->log("  [%s] MC normalization FAILED (%u/%u evaluated configs failed, %u/%u configs evaluated, %u samples/config, relTol=%e)",
             system::ILogger::ELL_ERROR, Config::name(), ctx.failCount, evaluatedConfigs, evaluatedConfigs, Config::numConfigurations, Config::samplesPerConfig, Config::mcNormalizationRelTol);
 
-      return ctx.finalize(m_logger, Config::name());
+      const bool passed = ctx.finalize(m_logger, Config::name());
+      if (!passed)
+         m_failureSeed = ctx.seed;
+      return passed;
    }
 
    // Test 4: Grid integration of backwardPdf over [0,1]^d codomain
@@ -167,7 +172,7 @@ class CSamplerPropertyTester
    // integral of backwardPdf over codomain should equal 1.0.
    bool testGridPdfNormalization()
    {
-      SeededTestContext ctx;
+      SeededTestContext ctx(m_seedOverride);
 
       for (uint32_t c = 0; c < Config::numConfigurations; c++)
       {
@@ -191,10 +196,15 @@ class CSamplerPropertyTester
          m_logger->log("  [%s] grid PDF normalization FAILED (%u/%u configs exceeded absTol=%e)",
             system::ILogger::ELL_ERROR, Config::name(), ctx.failCount, Config::numConfigurations, Config::gridNormalizationAbsTol);
 
-      return ctx.finalize(m_logger, Config::name());
+      const bool passed = ctx.finalize(m_logger, Config::name());
+      if (!passed)
+         m_failureSeed = ctx.seed;
+      return passed;
    }
 
    system::ILogger* m_logger;
+   std::optional<uint32_t> m_seedOverride;
+   std::optional<uint32_t> m_failureSeed;
 };
 
 
diff --git a/64_EmulatedFloatTest/main.cpp b/64_EmulatedFloatTest/main.cpp
index 11a6d2f55..549596bac 100644
--- a/64_EmulatedFloatTest/main.cpp
+++ b/64_EmulatedFloatTest/main.cpp
@@ -91,20 +91,22 @@ class CEF64Benchmark : public GPUBenchmark
 
    void doRun() override
    {
-      const PipelineEntry&   pe = m_pipelines[m_pipelineIdx];
+      const PipelineEntry*   pe = getPipelineEntry(m_pipelineIdx, joinName(m_name));
+      if (!pe)
+         return;
       BenchmarkPushConstants pc = {};
       pc.benchmarkMode          = m_mode;
 
       const TimingResult t = runTimedBudgeted(getWarmupDispatches(), getTargetBudgetMs(),
          [&](IGPUCommandBuffer* cb)
          {
-            cb->bindDescriptorSets(EPBP_COMPUTE, pe.layout.get(), 0, 1, &m_ds.get());
-            defaultBindAndPush(cb, pe, pc);
+            cb->bindDescriptorSets(EPBP_COMPUTE, pe->layout.get(), 0, 1, &m_ds.get());
+            defaultBindAndPush(cb, *pe, pc);
          },
          [this](IGPUCommandBuffer* cb) { defaultDispatch(cb); },
          samplesForCurrentRow());
 
-      record(m_name, t, pe.stats);
+      record(m_name, t, pe->stats);
    }
 
    private:
@@ -1055,4 +1057,4 @@ class CompatibilityTest final : public MonoDeviceApplication, public BuiltinReso
    std::ofstream m_logFile;
 };
 
-NBL_MAIN_FUNC(CompatibilityTest)
\ No newline at end of file
+NBL_MAIN_FUNC(CompatibilityTest)
diff --git a/common/include/nbl/examples/Benchmark/BenchmarkJson.h b/common/include/nbl/examples/Benchmark/BenchmarkJson.h
index dc7846848..e6d3fff24 100644
--- a/common/include/nbl/examples/Benchmark/BenchmarkJson.h
+++ b/common/include/nbl/examples/Benchmark/BenchmarkJson.h
@@ -13,6 +13,7 @@
 #include <fstream>
 #include <optional>
 #include <string>
+#include <unordered_map>
 #include <unordered_set>
 #include <vector>
 
@@ -85,32 +86,52 @@ inline std::optional<Baseline> loadBaselineFile(std::string label, const std::st
          continue;
          
       BaselineRow row;
-      row.psPerSample     = ps->get<double>();
-      row.registerCount   = r.at("regs").get<uint64_t>();
-      row.codeSizeBytes   = r.at("code_bytes").get<uint64_t>();
-      row.sharedMemBytes  = r.at("shared_mem_bytes").get<uint64_t>();
-      row.privateMemBytes = r.at("local_mem_bytes").get<uint64_t>();
-      row.stackBytes      = r.at("stack_bytes").get<uint64_t>();
-      row.subgroupSize    = r.at("subgroup_size").get<uint64_t>();
+      try
+      {
+         row.psPerSample = ps->get<double>();
+      }
+      catch (const std::exception&)
+      {
+         continue;
+      }
+
+      auto readU64 = [&](const char* key, uint64_t& out)
+      {
+         const auto it = r.find(key);
+         if (it != r.end() && it->is_number_unsigned())
+            out = it->get<uint64_t>();
+      };
+      readU64("regs", row.registerCount);
+      readU64("code_bytes", row.codeSizeBytes);
+      readU64("shared_mem_bytes", row.sharedMemBytes);
+      readU64("local_mem_bytes", row.privateMemBytes);
+      readU64("stack_bytes", row.stackBytes);
+      readU64("subgroup_size", row.subgroupSize);
 
       auto readUvec3 = [&](const char* key, nbl::hlsl::uint32_t3& out)
       {
-         const auto& a = r.at(key);
-         out.x         = a[0].get<uint32_t>();
-         out.y         = a[1].get<uint32_t>();
-         out.z         = a[2].get<uint32_t>();
+         const auto it = r.find(key);
+         if (it == r.end() || !it->is_array() || it->size() != 3)
+            return;
+         const auto& a = *it;
+         if (!a[0].is_number_unsigned() || !a[1].is_number_unsigned() || !a[2].is_number_unsigned())
+            return;
+         out.x = a[0].get<uint32_t>();
+         out.y = a[1].get<uint32_t>();
+         out.z = a[2].get<uint32_t>();
       };
       readUvec3("workgroup_size", row.workload.shape.workgroupSize);
       readUvec3("dispatch_groups", row.workload.shape.dispatchGroupCount);
-      row.workload.shape.samplesPerDispatch = r.at("samples_per_dispatch").get<uint64_t>();
-      row.workload.benchDispatches          = r.at("bench_dispatches").get<uint32_t>();
+      readU64("samples_per_dispatch", row.workload.shape.samplesPerDispatch);
+      if (const auto it = r.find("bench_dispatches"); it != r.end() && it->is_number_unsigned())
+         row.workload.benchDispatches = it->get<uint32_t>();
 
       rowsByName[makeKey(nameVec)] = row;
    }
    if (rowsByName.empty())
       return std::nullopt;
 
-   return Baseline {std::move(label), path, j.at("device"), std::move(rowsByName)};
+   return Baseline {std::move(label), path, j.contains("device") ? j["device"] : nullptr, std::move(rowsByName)};
 }
 
 // Writes a JSON report. Preserves rows in the prior file whose names weren't
diff --git a/common/include/nbl/examples/Benchmark/GPUBenchmarkHelper.h b/common/include/nbl/examples/Benchmark/GPUBenchmarkHelper.h
index 3cd5d7a91..553e5a21b 100644
--- a/common/include/nbl/examples/Benchmark/GPUBenchmarkHelper.h
+++ b/common/include/nbl/examples/Benchmark/GPUBenchmarkHelper.h
@@ -344,9 +344,7 @@ class GPUBenchmarkHelper
       nbl::core::smart_refctd_ptr<nbl::video::IGPUDescriptorSetLayout> dsLayout = nullptr)
    {
       using namespace nbl;
-      const uint32_t idx = uint32_t(m_pipelines.size());
-      m_pipelines.push_back({.tag = tag});
-      PipelineEntry& slot = m_pipelines.back();
+      PipelineEntry slot = {.tag = tag};
 
       const asset::SPushConstantRange pcRange = {
          .stageFlags = asset::IShader::E_SHADER_STAGE::ESS_COMPUTE,
@@ -359,7 +357,7 @@ class GPUBenchmarkHelper
       if (!layout)
       {
          benchLogFmt(m_logger.get(), system::ILogger::ELL_ERROR, "createPipeline({}): pipeline layout creation failed", tag);
-         return idx;
+         return InvalidPipelineIndex;
       }
 
       auto source = loadShader(variant, std::move(assetMgr));
@@ -367,7 +365,7 @@ class GPUBenchmarkHelper
       if (!shader)
       {
          benchLogFmt(m_logger.get(), system::ILogger::ELL_ERROR, "createPipeline({}): shader load/compile failed", tag);
-         return idx;
+         return InvalidPipelineIndex;
       }
 
       video::IGPUComputePipeline::SCreationParams pp = {};
@@ -381,7 +379,7 @@ class GPUBenchmarkHelper
       if (!m_device->createComputePipelines(nullptr, {&pp, 1}, &pipeline) || !pipeline)
       {
          benchLogFmt(m_logger.get(), system::ILogger::ELL_ERROR, "createPipeline({}): createComputePipelines failed", tag);
-         return idx;
+         return InvalidPipelineIndex;
       }
 
       if (m_device->getEnabledFeatures().pipelineExecutableInfo)
@@ -408,6 +406,8 @@ class GPUBenchmarkHelper
 
       slot.layout   = std::move(layout);
       slot.pipeline = std::move(pipeline);
+      const uint32_t idx = uint32_t(m_pipelines.size());
+      m_pipelines.push_back(std::move(slot));
       return idx;
    }
 
@@ -652,6 +652,18 @@ class GPUBenchmarkHelper
    }
 
 protected:
+   static constexpr uint32_t InvalidPipelineIndex = std::numeric_limits<uint32_t>::max();
+
+   const PipelineEntry* getPipelineEntry(uint32_t idx, std::string_view context) const
+   {
+      if (idx == InvalidPipelineIndex || idx >= m_pipelines.size() || !m_pipelines[idx].pipeline)
+      {
+         benchLogFmt(m_logger.get(), nbl::system::ILogger::ELL_ERROR, "{}: pipeline is not available", context);
+         return nullptr;
+      }
+      return &m_pipelines[idx];
+   }
+
    std::vector<PipelineEntry> m_pipelines;
 
 private:
diff --git a/common/include/nbl/examples/Tester/FailureManifest.h b/common/include/nbl/examples/Tester/FailureManifest.h
new file mode 100644
index 000000000..a703e933e
--- /dev/null
+++ b/common/include/nbl/examples/Tester/FailureManifest.h
@@ -0,0 +1,331 @@
+#ifndef _NBL_COMMON_TESTER_FAILURE_MANIFEST_INCLUDED_
+#define _NBL_COMMON_TESTER_FAILURE_MANIFEST_INCLUDED_
+
+#include <nabla.h>
+
+#include "nlohmann/json.hpp"
+
+#include <algorithm>
+#include <cstdint>
+#include <exception>
+#include <fstream>
+#include <map>
+#include <optional>
+#include <set>
+#include <span>
+#include <string>
+#include <string_view>
+#include <utility>
+#include <vector>
+
+namespace nbl::examples::testing
+{
+
+struct FailureCase
+{
+   std::string check;
+   std::string side;
+   uint64_t iteration = 0;
+   uint32_t seed = 0;
+   double maxRelative = 0.0;
+   double maxAbsolute = 0.0;
+};
+
+struct FailureGroup
+{
+   std::string phase;
+   std::string id;
+   std::string name;
+   std::string logFile;
+   std::vector<FailureCase> cases;
+   uint32_t omittedCases = 0;
+};
+
+class FailureManifest
+{
+   public:
+   explicit FailureManifest(std::string suite = {}) : m_suite(std::move(suite)) {}
+
+   void setSuite(std::string suite) { m_suite = std::move(suite); }
+
+   void addGroupFailure(std::string_view phase, std::string_view id, std::string_view name, std::string_view logFile = {})
+   {
+      auto& group = groupFor(phase, id, name);
+      if (!logFile.empty())
+         group.logFile = std::string(logFile);
+   }
+
+   void addCase(std::string_view phase, std::string_view id, std::string_view name, std::string_view check, std::string_view side,
+      uint64_t iteration, uint32_t seed, double maxRelative, double maxAbsolute)
+   {
+      auto& group = groupFor(phase, id, name);
+      if (group.cases.size() >= MaxCasesPerGroup)
+      {
+         ++group.omittedCases;
+         return;
+      }
+
+      group.cases.push_back(FailureCase{
+         .check = std::string(check),
+         .side = std::string(side),
+         .iteration = iteration,
+         .seed = seed,
+         .maxRelative = maxRelative,
+         .maxAbsolute = maxAbsolute,
+      });
+   }
+
+   const std::vector<FailureGroup>& failures() const { return m_failures; }
+
+   nlohmann::json toJson() const
+   {
+      nlohmann::json doc;
+      doc["version"] = 1;
+      doc["suite"] = m_suite;
+      auto& failures = doc["failures"] = nlohmann::json::array();
+
+      for (const auto& group : m_failures)
+      {
+         nlohmann::json g;
+         g["phase"] = group.phase;
+         g["id"] = group.id;
+         g["name"] = group.name;
+         if (!group.logFile.empty())
+            g["log_file"] = group.logFile;
+
+         auto& cases = g["cases"] = nlohmann::json::array();
+         for (const auto& c : group.cases)
+         {
+            nlohmann::json row;
+            row["check"] = c.check;
+            row["side"] = c.side;
+            row["iteration"] = c.iteration;
+            row["seed"] = c.seed;
+            row["max_relative"] = c.maxRelative;
+            row["max_absolute"] = c.maxAbsolute;
+            cases.push_back(std::move(row));
+         }
+
+         if (group.omittedCases > 0)
+            g["omitted_cases"] = group.omittedCases;
+
+         failures.push_back(std::move(g));
+      }
+
+      return doc;
+   }
+
+   private:
+   static constexpr size_t MaxCasesPerGroup = 64;
+
+   FailureGroup& groupFor(std::string_view phase, std::string_view id, std::string_view name)
+   {
+      const std::string idString(id);
+      auto it = std::find_if(m_failures.begin(), m_failures.end(), [&](const FailureGroup& g) { return g.id == idString; });
+      if (it != m_failures.end())
+      {
+         if (it->name.empty())
+            it->name = std::string(name);
+         if (it->phase.empty())
+            it->phase = std::string(phase);
+         return *it;
+      }
+
+      m_failures.push_back(FailureGroup{
+         .phase = std::string(phase),
+         .id = idString,
+         .name = std::string(name),
+      });
+      return m_failures.back();
+   }
+
+   std::string m_suite;
+   std::vector<FailureGroup> m_failures;
+};
+
+class TestFilter
+{
+   public:
+   bool enabled() const { return m_enabled; }
+
+   void enable() { m_enabled = true; }
+
+   bool shouldRun(std::string_view id) const
+   {
+      return !m_enabled || m_ids.contains(std::string(id));
+   }
+
+   void add(std::string_view id)
+   {
+      m_enabled = true;
+      const auto first = id.find_first_not_of(" \t\r\n");
+      if (first == std::string_view::npos)
+         return;
+      const auto last = id.find_last_not_of(" \t\r\n");
+      m_ids.insert(std::string(id.substr(first, last - first + 1)));
+   }
+
+   void addSeed(std::string_view id, uint32_t seed)
+   {
+      add(id);
+      m_seeds[std::string(id)] = seed;
+   }
+
+   void addList(std::string_view ids)
+   {
+      m_enabled = true;
+      while (!ids.empty())
+      {
+         const auto comma = ids.find(',');
+         add(ids.substr(0, comma));
+         if (comma == std::string_view::npos)
+            return;
+         ids.remove_prefix(comma + 1);
+      }
+   }
+
+   std::optional<uint32_t> seedFor(std::string_view id) const
+   {
+      auto it = m_seeds.find(std::string(id));
+      if (it == m_seeds.end())
+         return {};
+      return it->second;
+   }
+
+   private:
+   bool m_enabled = false;
+   std::set<std::string> m_ids;
+   std::map<std::string, uint32_t> m_seeds;
+};
+
+struct RunControl
+{
+   bool valid = true;
+   bool skipBenchmarks = false;
+   std::string failedOutPath;
+   TestFilter filter;
+};
+
+inline bool addFailedIdsFromFile(TestFilter& filter, const std::string& path, nbl::system::ILogger* logger)
+{
+   filter.enable();
+   std::ifstream in(path);
+   if (!in.is_open())
+   {
+      if (logger)
+         logger->log("Failed to open failed-test manifest '%s'", nbl::system::ILogger::ELL_ERROR, path.c_str());
+      return false;
+   }
+
+   nlohmann::json doc;
+   try
+   {
+      in >> doc;
+   }
+   catch (const std::exception& e)
+   {
+      if (logger)
+         logger->log("Failed to parse failed-test manifest '%s': %s", nbl::system::ILogger::ELL_ERROR, path.c_str(), e.what());
+      return false;
+   }
+
+   const auto failuresIt = doc.find("failures");
+   if (failuresIt == doc.end() || !failuresIt->is_array())
+   {
+      if (logger)
+         logger->log("Failed-test manifest '%s' does not contain a failures array", nbl::system::ILogger::ELL_ERROR, path.c_str());
+      return false;
+   }
+
+   for (const auto& failure : *failuresIt)
+   {
+      if (!failure.is_object())
+         continue;
+      const auto idIt = failure.find("id");
+      if (idIt != failure.end() && idIt->is_string())
+      {
+         const std::string id = idIt->get<std::string>();
+         const auto casesIt = failure.find("cases");
+         if (casesIt != failure.end() && casesIt->is_array())
+         {
+            const auto seedIt = std::find_if(casesIt->begin(), casesIt->end(), [](const nlohmann::json& row) {
+               if (!row.is_object())
+                  return false;
+               const auto it = row.find("seed");
+               return it != row.end() && it->is_number_integer();
+            });
+            if (seedIt != casesIt->end())
+            {
+               filter.addSeed(id, (*seedIt)["seed"].get<uint32_t>());
+               continue;
+            }
+         }
+         filter.add(id);
+      }
+   }
+
+   return true;
+}
+
+inline RunControl parseRunControl(std::span<const std::string> argv, nbl::system::ILogger* logger)
+{
+   RunControl out;
+
+   for (size_t i = 1; i < argv.size(); ++i)
+   {
+      const std::string& arg = argv[i];
+      if (arg == "--skip-benchmarks")
+         out.skipBenchmarks = true;
+      else if (arg == "--failed-out" && i + 1 < argv.size())
+         out.failedOutPath = argv[++i];
+      else if (arg.starts_with("--failed-out="))
+         out.failedOutPath = arg.substr(std::string("--failed-out=").size());
+      else if (arg == "--test" && i + 1 < argv.size())
+         out.filter.addList(argv[++i]);
+      else if (arg.starts_with("--test="))
+         out.filter.addList(std::string_view(arg).substr(std::string_view("--test=").size()));
+      else if (arg == "--rerun-failed" && i + 1 < argv.size())
+      {
+         if (!addFailedIdsFromFile(out.filter, argv[++i], logger))
+            out.valid = false;
+      }
+      else if (arg.starts_with("--rerun-failed="))
+      {
+         if (!addFailedIdsFromFile(out.filter, arg.substr(std::string("--rerun-failed=").size()), logger))
+            out.valid = false;
+      }
+   }
+
+   if (out.filter.enabled())
+      out.skipBenchmarks = true;
+
+   return out;
+}
+
+inline bool writeFailureManifestFile(const FailureManifest& manifest, const std::string& path, nbl::system::ILogger* logger)
+{
+   std::ofstream out(path, std::ios::out | std::ios::trunc);
+   if (!out.is_open())
+   {
+      if (logger)
+         logger->log("Failed to open failed-test manifest '%s' for writing", nbl::system::ILogger::ELL_ERROR, path.c_str());
+      return false;
+   }
+
+   out << manifest.toJson().dump(3) << '\n';
+   if (!out.good())
+   {
+      if (logger)
+         logger->log("Failed to write failed-test manifest '%s'", nbl::system::ILogger::ELL_ERROR, path.c_str());
+      return false;
+   }
+
+   if (logger)
+      logger->log("Wrote failed-test manifest '%s' with %llu failed groups", nbl::system::ILogger::ELL_INFO,
+         path.c_str(), static_cast<unsigned long long>(manifest.failures().size()));
+   return true;
+}
+
+} // namespace nbl::examples::testing
+
+#endif
diff --git a/common/include/nbl/examples/Tester/ITester.h b/common/include/nbl/examples/Tester/ITester.h
index 8fd4c6639..27dfccff2 100644
--- a/common/include/nbl/examples/Tester/ITester.h
+++ b/common/include/nbl/examples/Tester/ITester.h
@@ -3,6 +3,7 @@
 
 #include <nabla.h>
 #include <nbl/system/to_string.h>
+#include <nbl/examples/Tester/FailureManifest.h>
 #include <ranges>
 #include <nbl/builtin/hlsl/testing/relative_approx_compare.hlsl>
 #include <nbl/builtin/hlsl/testing/approx_compare.hlsl>
@@ -171,6 +172,7 @@ class ITester
 
    bool performTestsAndVerifyResults(const std::string& logFileName)
    {
+      m_failureLogFile = logFileName;
       m_logFile.open(logFileName, std::ios::out | std::ios::trunc);
       if (!m_logFile.is_open())
          m_logger->log("Failed to open log file!", system::ILogger::ELL_ERROR);
@@ -197,6 +199,8 @@ class ITester
       core::vector<TestResults> gpuTestResults = performGpuTests(inputTestValues);
 
       bool pass = verifyAllTestResults(cpuTestResults, gpuTestResults, exceptedTestResults);
+      if (!pass && m_failureManifest)
+         m_failureManifest->addGroupFailure(m_failurePhase, m_failureId, m_failureName, m_failureLogFile);
 
       m_logger->log("TESTS DONE.", system::ILogger::ELL_PERFORMANCE);
       reloadSeed();
@@ -205,6 +209,20 @@ class ITester
       return pass;
    }
 
+   void setFailureRecordContext(nbl::examples::testing::FailureManifest* manifest, std::string phase, std::string id, std::string name)
+   {
+      m_failureManifest = manifest;
+      m_failurePhase = std::move(phase);
+      m_failureId = std::move(id);
+      m_failureName = std::move(name);
+   }
+
+   void setSeed(uint32_t seed)
+   {
+      m_seed = seed;
+      m_mersenneTwister = std::mt19937(m_seed);
+   }
+
    virtual ~ITester()
    {
       m_outputBufferAllocation.memory->unmap();
@@ -339,6 +357,13 @@ class ITester
          ss << " DIFFERENCE: " << system::to_string(hlsl::abs(expectedVal - testVal));
       ss << " MAX RELATIVE: " << system::to_string(maxRelativeDifference) << " MAX ABSOLUTE " << system::to_string(maxAbsoluteDifference) << '\n';
 
+      if (m_failureManifest)
+      {
+         const char* side = testType == TestType::CPU ? "CPU" : "GPU";
+         m_failureManifest->addCase(m_failurePhase, m_failureId, m_failureName, memberName, side,
+            testIteration, seed, maxRelativeDifference, maxAbsoluteDifference);
+      }
+
       m_logger->log("%s", system::ILogger::ELL_ERROR, ss.str().c_str());
       m_logFile << ss.str() << '\n';
    }
@@ -439,6 +464,11 @@ class ITester
    uint32_t m_seed;
    std::ofstream m_logFile;
    core::unordered_map<std::string, hlsl::testing::SMaxError> m_maxErrors;
+   nbl::examples::testing::FailureManifest* m_failureManifest = nullptr;
+   std::string m_failurePhase;
+   std::string m_failureId;
+   std::string m_failureName;
+   std::string m_failureLogFile;
 };
 
-#endif
\ No newline at end of file
+#endif