From 1b54b42b5d31f1f1d72f2760438f2a323ab44e73 Mon Sep 17 00:00:00 2001 From: Karim Mohamed Date: Wed, 13 May 2026 10:50:22 +0300 Subject: [PATCH 1/4] Refactor GPU benchmarking with unified helper framework Refactored the GPU benchmarking system to use new helper classes (`IBenchmark`, `GPUBenchmarkHelper`, and related types), replacing manual pipeline nuffer, and query management with a unified, extensible framework. Benchmarks now use a common `Aggregator` for session orchestration, CLI parsing, baseline comparison, and reporting. Added support for hierarchical variant names, focus/median-of-K runs, robust JSON output, and improved console reporting. Updated main application and all benchmarks to use the new structure, improving maintainability and extensibility. --- .../benchmarks/CDiscreteSamplerBenchmark.h | 398 ++- .../benchmarks/CSamplerBenchmark.h | 273 +-- 37_HLSLSamplingTests/main.cpp | 393 ++- 64_EmulatedFloatTest/main.cpp | 2131 ++++++++--------- .../nbl/examples/Benchmark/BenchmarkCli.h | 125 + .../nbl/examples/Benchmark/BenchmarkConsole.h | 526 ++++ .../nbl/examples/Benchmark/BenchmarkJson.h | 285 +++ .../nbl/examples/Benchmark/BenchmarkTypes.h | 211 ++ .../examples/Benchmark/GPUBenchmarkHelper.h | 693 ++++++ .../nbl/examples/Benchmark/IBenchmark.h | 409 ++++ 10 files changed, 3634 insertions(+), 1810 deletions(-) create mode 100644 common/include/nbl/examples/Benchmark/BenchmarkCli.h create mode 100644 common/include/nbl/examples/Benchmark/BenchmarkConsole.h create mode 100644 common/include/nbl/examples/Benchmark/BenchmarkJson.h create mode 100644 common/include/nbl/examples/Benchmark/BenchmarkTypes.h create mode 100644 common/include/nbl/examples/Benchmark/GPUBenchmarkHelper.h create mode 100644 common/include/nbl/examples/Benchmark/IBenchmark.h diff --git a/37_HLSLSamplingTests/benchmarks/CDiscreteSamplerBenchmark.h b/37_HLSLSamplingTests/benchmarks/CDiscreteSamplerBenchmark.h index b2a2fad9a..d6289f54b 100644 --- a/37_HLSLSamplingTests/benchmarks/CDiscreteSamplerBenchmark.h +++ b/37_HLSLSamplingTests/benchmarks/CDiscreteSamplerBenchmark.h @@ -6,187 +6,144 @@ #include #include #include "app_resources/common/discrete_sampler_bench.hlsl" +#include "nbl/examples/Benchmark/IBenchmark.h" +#include "nbl/examples/Benchmark/GPUBenchmarkHelper.h" #include using namespace nbl; -class CDiscreteSamplerBenchmark +class CDiscreteSamplerBenchmark : public GPUBenchmark { public: - struct SetupData + // Declared up-front because it's used as the index domain for m_pipelineIdx[] + // (a member-array bound needs the type complete in declaration order). + enum class SamplerKind : uint32_t { - core::smart_refctd_ptr device; - core::smart_refctd_ptr api; - core::smart_refctd_ptr assetMgr; - core::smart_refctd_ptr logger; - IPhysicalDevice* physicalDevice; - std::string packedAliasAShaderKey; - std::string packedAliasBShaderKey; - std::string cumProbShaderKey; - std::string cumProbYoloShaderKey; - std::string cumProbEytzingerShaderKey; - uint32_t computeFamilyIndex; - uint32_t dispatchGroupCount; + AliasPackedA = 0, + AliasPackedB, + CumProbCompare, + CumProbYolo, + CumProbEytzinger, + Count }; - void setup(const SetupData& data) + struct SetupData { - m_device = data.device; - m_logger = data.logger; - m_assetMgr = data.assetMgr; - m_dispatchGroupCount = data.dispatchGroupCount; - m_physicalDevice = data.physicalDevice; + core::smart_refctd_ptr assetMgr; + // Each pipeline is independent; main.cpp can pick precompiled or runtime per + // pipeline by passing ShaderVariant::Precompiled(get_spirv_key<...>()) or + // ShaderVariant::FromSource(path, defines) respectively. + GPUBenchmarkHelper::ShaderVariant packedAliasAVariant; + GPUBenchmarkHelper::ShaderVariant packedAliasBVariant; + GPUBenchmarkHelper::ShaderVariant cumProbVariant; + GPUBenchmarkHelper::ShaderVariant cumProbYoloVariant; + GPUBenchmarkHelper::ShaderVariant cumProbEytzingerVariant; + hlsl::uint32_t3 dispatchGroupCount; + uint64_t targetBudgetMs = 400; // wall-clock budget per sweep row + // N values the sweep cycles through. Dispatch count per row is auto-sized + // by runTimedBudgeted to hit the budget. + std::span sweepNs; + }; - m_queue = m_device->getQueue(data.computeFamilyIndex, 0); + // Shape is derivable from SetupData; expose it so the caller can use it + // both to configure the bench and to build the matching RunContext for the + // span that runs this bench + static WorkloadShape shapeFor(const SetupData& data) + { + const uint32_t totalThreads = data.dispatchGroupCount.x * data.dispatchGroupCount.y * data.dispatchGroupCount.z * WORKGROUP_SIZE; + const uint64_t samplesPerDispatch = uint64_t(totalThreads) * uint64_t(BENCH_ITERS); + return { + .workgroupSize = {WORKGROUP_SIZE, 1u, 1u}, + .dispatchGroupCount = data.dispatchGroupCount, + .samplesPerDispatch = samplesPerDispatch, + }; + } - // Staging-upload utility. Without this, BDA buffers land in host-visible (system RAM) - // and every sampler load becomes a PCIe round-trip instead of hitting VRAM/L2. - m_utils = IUtilities::create(core::smart_refctd_ptr(m_device), core::smart_refctd_ptr(m_logger)); + CDiscreteSamplerBenchmark(Aggregator& aggregator, const SetupData& data) + : GPUBenchmark(aggregator, GPUBenchmark::SetupData{ + .name = {}, // per-row names synthesized at run time + .warmupDispatches = 0, + .shape = shapeFor(data), + .targetBudgetMs = data.targetBudgetMs, + }) + { + const uint32_t totalThreads = data.dispatchGroupCount.x * data.dispatchGroupCount.y * data.dispatchGroupCount.z * WORKGROUP_SIZE; - // Command pool + buffers - m_cmdpool = m_device->createCommandPool(data.computeFamilyIndex, IGPUCommandPool::CREATE_FLAGS::RESET_COMMAND_BUFFER_BIT); - m_cmdpool->createCommandBuffers(IGPUCommandPool::BUFFER_LEVEL::PRIMARY, 1u, &m_benchCmdbuf); + m_assetMgr = data.assetMgr; + m_sweepNs = data.sweepNs; - // Timestamp query pool + for (const uint32_t N : m_sweepNs) { - IQueryPool::SCreationParams qp = {}; - qp.queryType = IQueryPool::TYPE::TIMESTAMP; - qp.queryCount = 2; - qp.pipelineStatisticsFlags = IQueryPool::PIPELINE_STATISTICS_FLAGS::NONE; - m_queryPool = m_device->createQueryPool(qp); + const std::string nStr = std::format("N={}", N); + for (const auto& v : kSweepVariants) + registerVariant({nStr, v.family, v.leaf}); } - const uint32_t totalThreads = m_dispatchGroupCount * WORKGROUP_SIZE; - // Shared output buffer (size only depends on thread count). GPU writes via BDA and - // nothing reads it on the CPU, so pin it to device-local VRAM. - { - IGPUBuffer::SCreationParams bp = {}; - bp.size = totalThreads * sizeof(uint32_t); - bp.usage = core::bitflag(IGPUBuffer::EUF_STORAGE_BUFFER_BIT) | IGPUBuffer::EUF_SHADER_DEVICE_ADDRESS_BIT; - m_outputBuf = m_device->createBuffer(std::move(bp)); - IDeviceMemoryBacked::SDeviceMemoryRequirements reqs = m_outputBuf->getMemoryReqs(); - reqs.memoryTypeBits &= data.physicalDevice->getDeviceLocalMemoryTypeBits(); - m_device->allocate(reqs, m_outputBuf.get(), IDeviceMemoryAllocation::EMAF_DEVICE_ADDRESS_BIT); - } - - // Pipelines (N-independent; only push constants change per run) - m_packedAliasAPipeline = createPipeline(data.packedAliasAShaderKey, m_packedAliasAPplnLayout, "alias-packed-A"); - m_packedAliasBPipeline = createPipeline(data.packedAliasBShaderKey, m_packedAliasBPplnLayout, "alias-packed-B"); - m_cumProbPipeline = createPipeline(data.cumProbShaderKey, m_cumProbPplnLayout, "cumprob-comparator"); - m_cumProbYoloPipeline = createPipeline(data.cumProbYoloShaderKey, m_cumProbYoloPplnLayout, "cumprob-yolo"); - m_cumProbEytzingerPipeline = createPipeline(data.cumProbEytzingerShaderKey, m_cumProbEytzingerPplnLayout, "cumprob-eytzinger"); + // nothing reads it on the CPU. + m_outputBuf = createBdaOutputBuffer(totalThreads * sizeof(uint32_t)).buf; + + // Pipelines (N-independent; only push constants change per run). Indices + // into m_pipelines (GPUBenchmarkHelper) are stored in the same order as SamplerKind + // so the sweep's variant table can index by enum directly. + m_pipelineIdx[static_cast(SamplerKind::AliasPackedA)] = createPipeline(data.packedAliasAVariant, m_assetMgr, sizeof(PackedAliasABPushConstants), "alias-packed-A"); + m_pipelineIdx[static_cast(SamplerKind::AliasPackedB)] = createPipeline(data.packedAliasBVariant, m_assetMgr, sizeof(PackedAliasABPushConstants), "alias-packed-B"); + m_pipelineIdx[static_cast(SamplerKind::CumProbCompare)] = createPipeline(data.cumProbVariant, m_assetMgr, sizeof(CumProbPushConstants), "cumprob-comparator"); + m_pipelineIdx[static_cast(SamplerKind::CumProbYolo)] = createPipeline(data.cumProbYoloVariant, m_assetMgr, sizeof(CumProbPushConstants), "cumprob-yolo"); + m_pipelineIdx[static_cast(SamplerKind::CumProbEytzinger)] = createPipeline(data.cumProbEytzingerVariant, m_assetMgr, sizeof(CumProbPushConstants), "cumprob-eytzinger"); } - // DispatchScheduler: uint32_t N -> std::pair. - // Lets the caller trade wall-clock for statistical stability per size: - // big-N runs are DRAM-bound and need fewer dispatches to hit the same total sample count. - struct DispatchCounts + // Rows are synthesized per (N, variant), not a single named entry, so + // each row checks cli.focusVariants individually. The aggregator's silent + // flag selects which half (focused / unfocused) we contribute to. + void run() override { - uint32_t warmup; - uint32_t bench; - }; - - template - void runSweep(const std::vector& tableSizes, DispatchScheduler scheduler) - { - const uint32_t totalThreads = m_dispatchGroupCount * WORKGROUP_SIZE; - m_logger->log("=== GPU Discrete Sampler Benchmark sweep (%u threads * %u iters/thread; wg=%u; dispatches chosen per-N) ===", - ILogger::ELL_PERFORMANCE, totalThreads, BENCH_ITERS, WORKGROUP_SIZE); - m_logger->log("%12s | %-34s | %12s | %12s | %12s | %10s", ILogger::ELL_PERFORMANCE, - "N", "Sampler", "ps/sample", "GSamples/s", "ms total", "dispatches"); + const bool focusedPhase = isFocusPhase(); + // Warmup is small and fixed; budgeted measurement auto-sizes the + // measured-dispatch count to hit getTargetBudgetMs(). + constexpr uint32_t kWarmupDispatches = 64; - for (uint32_t N : tableSizes) + for (const uint32_t N : m_sweepNs) { - const DispatchCounts dc = scheduler(N); - buildAndUpload(N); - // Packed A wins N<=16k; Packed B wins N>=32k. SoA and Packed C were dominated - // across every N measured, removed from the sweep. - runSingle(N, "AliasTable (packed A, 4 B)", m_packedAliasAPipeline, m_packedAliasAPplnLayout, SamplerKind::AliasPackedA, dc.warmup, dc.bench); - runSingle(N, "AliasTable (packed B, 8 B)", m_packedAliasBPipeline, m_packedAliasBPplnLayout, SamplerKind::AliasPackedB, dc.warmup, dc.bench); - runSingle(N, "CumulativeProbability", m_cumProbPipeline, m_cumProbPplnLayout, SamplerKind::CumProbCompare, dc.warmup, dc.bench); - runSingle(N, "CumulativeProbability (YOLO)", m_cumProbYoloPipeline, m_cumProbYoloPplnLayout, SamplerKind::CumProbYolo, dc.warmup, dc.bench); - runSingle(N, "CumulativeProbability (Eytzinger)", m_cumProbEytzingerPipeline, m_cumProbEytzingerPplnLayout, SamplerKind::CumProbEytzinger, dc.warmup, dc.bench); - releaseTables(); + const std::string nStr = std::format("N={}", N); + bool built = false; + for (const auto& [family, leaf, kind] : kSweepVariants) + { + core::vector name = {nStr, family, leaf}; + const bool inFocus = isFocused(name); + const bool shouldRun = focusedPhase ? inFocus : !inFocus; + if (!shouldRun) + continue; + if (!built) + { + buildAndUpload(N); + built = true; + } + runSingle(N, std::move(name), kind, kWarmupDispatches); + } + if (built) + releaseTables(); } } - // Convenience: sweep with fixed dispatch counts for every size. - void runSweep(const std::vector& tableSizes, uint32_t warmupIterations = 500, uint32_t benchmarkIterations = 5000) - { - runSweep(tableSizes, [warmupIterations, benchmarkIterations](uint32_t) -> DispatchCounts - { return {warmupIterations, benchmarkIterations}; }); - } - private: - enum class SamplerKind + // (family, leaf, kind) for every variant the sweep runs. + struct SweepVariant { - AliasPackedA, - AliasPackedB, - CumProbCompare, - CumProbYolo, - CumProbEytzinger + const char* family; // e.g. "AliasTable" + const char* leaf; // e.g. "packed A, 4 B" + SamplerKind kind; + }; + static constexpr SweepVariant kSweepVariants[] = { + {"AliasTable", "packed A, 4 B", SamplerKind::AliasPackedA}, + {"AliasTable", "packed B, 8 B", SamplerKind::AliasPackedB}, + {"CumulativeProbability", "comparator", SamplerKind::CumProbCompare}, + {"CumulativeProbability", "YOLO", SamplerKind::CumProbYolo}, + {"CumulativeProbability", "Eytzinger", SamplerKind::CumProbEytzinger}, }; - template - core::smart_refctd_ptr createPipeline(const std::string& shaderKey, core::smart_refctd_ptr& outLayout, const char* tag) - { - const SPushConstantRange pcRange = { - .stageFlags = IShader::E_SHADER_STAGE::ESS_COMPUTE, - .offset = 0, - .size = sizeof(PushConstantT)}; - auto layout = m_device->createPipelineLayout({&pcRange, 1}); - if (!layout) - m_logger->log("CDiscreteSamplerBenchmark: failed to create %s pipeline layout", ILogger::ELL_ERROR, tag); - - IAssetLoader::SAssetLoadParams lp = {}; - lp.logger = m_logger.get(); - lp.workingDirectory = "app_resources"; - auto bundle = m_assetMgr->getAsset(shaderKey, lp); - auto source = IAsset::castDown(bundle.getContents()[0]); - auto shader = m_device->compileShader({.source = source.get()}); - if (!shader) - m_logger->log("CDiscreteSamplerBenchmark: failed to load %s shader", ILogger::ELL_ERROR, tag); - - IGPUComputePipeline::SCreationParams pp = {}; - pp.layout = layout.get(); - pp.shader.shader = shader.get(); - pp.shader.entryPoint = "main"; - if (m_device->getEnabledFeatures().pipelineExecutableInfo) - { - pp.flags |= IGPUComputePipeline::SCreationParams::FLAGS::CAPTURE_STATISTICS | IGPUComputePipeline::SCreationParams::FLAGS::CAPTURE_INTERNAL_REPRESENTATIONS; - } - - core::smart_refctd_ptr pipeline; - if (!m_device->createComputePipelines(nullptr, {&pp, 1}, &pipeline)) - m_logger->log("CDiscreteSamplerBenchmark: failed to create %s compute pipeline", ILogger::ELL_ERROR, tag); - - if (m_device->getEnabledFeatures().pipelineExecutableInfo) - { - auto report = system::to_string(pipeline->getExecutableInfo()); - m_logger->log("%s Sampling Pipeline Executable Report:\n%s", ILogger::ELL_PERFORMANCE, tag, report.c_str()); - } - outLayout = std::move(layout); - return pipeline; - } - - core::smart_refctd_ptr createBdaBuffer(const void* srcData, size_t bytes) - { - IGPUBuffer::SCreationParams bp = {}; - bp.size = bytes; - bp.usage = core::bitflag(IGPUBuffer::EUF_STORAGE_BUFFER_BIT) | - IGPUBuffer::EUF_SHADER_DEVICE_ADDRESS_BIT | - IGPUBuffer::EUF_TRANSFER_DST_BIT; - - core::smart_refctd_ptr buf; - auto future = m_utils->createFilledDeviceLocalBufferOnDedMem( - SIntendedSubmitInfo {.queue = m_queue}, std::move(bp), srcData); - future.move_into(buf); - return buf; - } - - void buildAndUpload(uint32_t N) + void buildAndUpload(const uint32_t N) { m_currentN = N; @@ -204,8 +161,8 @@ class CDiscreteSamplerBenchmark std::vector aliasPdf; m_aliasTableN = sampling::AliasTableBuilder::build({weights}, aliasProb, aliasIdx, aliasPdf); - constexpr uint32_t kPackedLog2N = 26u; - std::vector packedA(m_aliasTableN); + constexpr uint32_t kPackedLog2N = 26u; + std::vector packedA(m_aliasTableN); std::vector> packedB(m_aliasTableN); sampling::AliasTableBuilder::packA({aliasProb}, {aliasIdx}, packedA.data()); sampling::AliasTableBuilder::packB({aliasProb}, {aliasIdx}, {aliasPdf}, packedB.data()); @@ -236,104 +193,45 @@ class CDiscreteSamplerBenchmark m_cumProbEytzingerBuf = nullptr; } - void runSingle(uint32_t N, const char* name, const core::smart_refctd_ptr& pipeline, const core::smart_refctd_ptr& layout, SamplerKind kind, uint32_t warmupIterations, uint32_t benchmarkIterations) + void runSingle(uint32_t N, core::vector name, SamplerKind kind, uint32_t warmupIterations) { - m_device->waitIdle(); - - // Everything (warmup, timestamped bench, cooldown) goes into ONE cmdbuf and ONE - // submit. Serial submissions with semaphore waits between them would add sync cost - // to every dispatch and prevent the driver from overlapping adjacent dispatches. - // With a single cmdbuf the driver pipelines freely, and GPU memory latency is - // hidden by warp hyperthreading rather than by cross-submit overlap. - // - // Layout: [warmup dispatches] [ts 0] [bench dispatches] [ts 1] [cooldown dispatches] - // Warmup brings clocks + caches to steady state before ts 0. Cooldown keeps the - // same steady-state context alive across ts 1 so the trailing bench dispatches - // don't measure a tail where the GPU is already winding down. - const uint32_t cooldownIterations = warmupIterations; - - m_benchCmdbuf->reset(IGPUCommandBuffer::RESET_FLAGS::NONE); - m_benchCmdbuf->begin(IGPUCommandBuffer::USAGE::ONE_TIME_SUBMIT_BIT); - m_benchCmdbuf->resetQueryPool(m_queryPool.get(), 0, 2); - m_benchCmdbuf->bindComputePipeline(pipeline.get()); - - if (kind == SamplerKind::AliasPackedA || kind == SamplerKind::AliasPackedB) - { - PackedAliasABPushConstants pc = {}; - pc.entriesAddress = (kind == SamplerKind::AliasPackedA ? m_packedAliasABuf : m_packedAliasBBuf)->getDeviceAddress(); - pc.pdfAddress = m_aliasPdfBuf->getDeviceAddress(); - pc.outputAddress = m_outputBuf->getDeviceAddress(); - pc.tableSize = m_aliasTableN; - m_benchCmdbuf->pushConstants(layout.get(), IShader::E_SHADER_STAGE::ESS_COMPUTE, 0u, sizeof(pc), &pc); - } - else - { - CumProbPushConstants pc = {}; - const auto& buf = (kind == SamplerKind::CumProbEytzinger) ? m_cumProbEytzingerBuf : m_cumProbBuf; - pc.cumProbAddress = buf->getDeviceAddress(); - pc.outputAddress = m_outputBuf->getDeviceAddress(); - pc.tableSize = N; - m_benchCmdbuf->pushConstants(layout.get(), IShader::E_SHADER_STAGE::ESS_COMPUTE, 0u, sizeof(pc), &pc); - } - - for (uint32_t i = 0u; i < warmupIterations; ++i) - m_benchCmdbuf->dispatch(m_dispatchGroupCount, 1, 1); - m_benchCmdbuf->writeTimestamp(PIPELINE_STAGE_FLAGS::COMPUTE_SHADER_BIT, m_queryPool.get(), 0); - for (uint32_t i = 0u; i < benchmarkIterations; ++i) - m_benchCmdbuf->dispatch(m_dispatchGroupCount, 1, 1); - m_benchCmdbuf->writeTimestamp(PIPELINE_STAGE_FLAGS::COMPUTE_SHADER_BIT, m_queryPool.get(), 1); - for (uint32_t i = 0u; i < cooldownIterations; ++i) - m_benchCmdbuf->dispatch(m_dispatchGroupCount, 1, 1); - m_benchCmdbuf->end(); - - auto semaphore = m_device->createSemaphore(0u); - const IQueue::SSubmitInfo::SCommandBufferInfo benchCmds[] = {{.cmdbuf = m_benchCmdbuf.get()}}; - const IQueue::SSubmitInfo::SSemaphoreInfo signalSem[] = { - {.semaphore = semaphore.get(), .value = 1u, .stageMask = PIPELINE_STAGE_FLAGS::COMPUTE_SHADER_BIT}}; - IQueue::SSubmitInfo submit = {}; - submit.commandBuffers = benchCmds; - submit.signalSemaphores = signalSem; - m_queue->submit({&submit, 1u}); - - m_device->waitIdle(); - - uint64_t timestamps[2] = {}; - const auto flags = core::bitflag(IQueryPool::RESULTS_FLAGS::_64_BIT) | - core::bitflag(IQueryPool::RESULTS_FLAGS::WAIT_BIT); - m_device->getQueryPoolResults(m_queryPool.get(), 0, 2, timestamps, sizeof(uint64_t), flags); - - constexpr uint32_t benchIters = BENCH_ITERS; - const float64_t timestampPeriod = float64_t(m_physicalDevice->getLimits().timestampPeriodInNanoSeconds); - const float64_t elapsed_ns = float64_t(timestamps[1] - timestamps[0]) * timestampPeriod; - const uint64_t totalThreads = uint64_t(m_dispatchGroupCount) * uint64_t(WORKGROUP_SIZE); - const uint64_t totalSamples = uint64_t(benchmarkIterations) * totalThreads * uint64_t(benchIters); - const float64_t ps_per_sample = elapsed_ns * 1e3 / float64_t(totalSamples); - const float64_t gsamples_per_s = float64_t(totalSamples) / elapsed_ns; - const float64_t elapsed_ms = elapsed_ns * 1e-6; - - m_logger->log("%12u | %-34s | %12.3f | %12.3f | %12.3f | %10u", - ILogger::ELL_PERFORMANCE, N, name, ps_per_sample, gsamples_per_s, elapsed_ms, benchmarkIterations); + // Pipeline + push constants are bound *once* in bindOnce, the inner loop is just + // dispatch(...). Putting binds inside dispatchOne would inflate ps/sample on the + // tighter samplers. + const PipelineEntry& pe = m_pipelines[m_pipelineIdx[size_t(kind)]]; + + const TimingResult timingResult = runTimedBudgeted(warmupIterations, getTargetBudgetMs(), + [&](IGPUCommandBuffer* cb) + { + if (kind == SamplerKind::AliasPackedA || kind == SamplerKind::AliasPackedB) + { + PackedAliasABPushConstants pc = {}; + pc.entriesAddress = (kind == SamplerKind::AliasPackedA ? m_packedAliasABuf : m_packedAliasBBuf)->getDeviceAddress(); + pc.pdfAddress = m_aliasPdfBuf->getDeviceAddress(); + pc.outputAddress = m_outputBuf->getDeviceAddress(); + pc.tableSize = m_aliasTableN; + defaultBindAndPush(cb, pe, pc); + } + else + { + CumProbPushConstants pc = {}; + const auto& buf = (kind == SamplerKind::CumProbEytzinger) ? m_cumProbEytzingerBuf : m_cumProbBuf; + pc.cumProbAddress = buf->getDeviceAddress(); + pc.outputAddress = m_outputBuf->getDeviceAddress(); + pc.tableSize = N; + defaultBindAndPush(cb, pe, pc); + } + }, + [this](IGPUCommandBuffer* cb) { defaultDispatch(cb); }, + samplesForCurrentRow()); + + record(std::move(name), timingResult, pe.stats); } - core::smart_refctd_ptr m_device; - core::smart_refctd_ptr m_logger; - core::smart_refctd_ptr m_assetMgr; - core::smart_refctd_ptr m_utils; - core::smart_refctd_ptr m_cmdpool; - core::smart_refctd_ptr m_benchCmdbuf; - core::smart_refctd_ptr m_queryPool; + core::smart_refctd_ptr m_assetMgr; - // Pipelines (set up once) - core::smart_refctd_ptr m_packedAliasAPplnLayout; - core::smart_refctd_ptr m_packedAliasAPipeline; - core::smart_refctd_ptr m_packedAliasBPplnLayout; - core::smart_refctd_ptr m_packedAliasBPipeline; - core::smart_refctd_ptr m_cumProbPplnLayout; - core::smart_refctd_ptr m_cumProbPipeline; - core::smart_refctd_ptr m_cumProbYoloPplnLayout; - core::smart_refctd_ptr m_cumProbYoloPipeline; - core::smart_refctd_ptr m_cumProbEytzingerPplnLayout; - core::smart_refctd_ptr m_cumProbEytzingerPipeline; + // Indices into m_pipelines (GPUBenchmarkHelper), indexed by SamplerKind. + uint32_t m_pipelineIdx[size_t(SamplerKind::Count)] = {}; // Per-N data buffers (rebuilt each sweep step). pdf[] is shared between A and B. core::smart_refctd_ptr m_aliasPdfBuf; @@ -344,11 +242,9 @@ class CDiscreteSamplerBenchmark // Shared core::smart_refctd_ptr m_outputBuf; - IQueue* m_queue = nullptr; - IPhysicalDevice* m_physicalDevice = nullptr; - uint32_t m_dispatchGroupCount = 0; - uint32_t m_currentN = 0; - uint32_t m_aliasTableN = 0; + uint32_t m_currentN = 0; + uint32_t m_aliasTableN = 0; + std::span m_sweepNs; }; #endif diff --git a/37_HLSLSamplingTests/benchmarks/CSamplerBenchmark.h b/37_HLSLSamplingTests/benchmarks/CSamplerBenchmark.h index 4f63c6fde..dd7149829 100644 --- a/37_HLSLSamplingTests/benchmarks/CSamplerBenchmark.h +++ b/37_HLSLSamplingTests/benchmarks/CSamplerBenchmark.h @@ -7,249 +7,54 @@ #include #include "nbl/examples/examples.hpp" -#include "../app_resources/common/sampler_bench_pc.hlsl" +#include "nbl/examples/Benchmark/IBenchmark.h" +#include "nbl/examples/Benchmark/GPUBenchmarkHelper.h" +#include "app_resources/common/sampler_bench_pc.hlsl" using namespace nbl; // Measures GPU execution time of a sampler shader using GPU timestamp queries. -class CSamplerBenchmark +// Output is implicit BDA addressed via SamplerBenchPushConstants. GPU plumbing +// (pipeline / buffer / timestamp queries) comes from GPUBenchmarkHelper; the +// bench-side glue here is PC layout + per-run dispatch + result recording. +class CSamplerBenchmark : public GPUBenchmark { -public: - struct SetupData - { - core::smart_refctd_ptr device; - core::smart_refctd_ptr api; - core::smart_refctd_ptr assetMgr; - core::smart_refctd_ptr logger; - video::IPhysicalDevice* physicalDevice; - uint32_t computeFamilyIndex; - std::string shaderKey; - uint32_t dispatchGroupCount; // workgroup count = testBatchCount - uint32_t samplesPerDispatch; // dispatchGroupCount * WorkgroupSize * benchIters - size_t inputBufferBytes; // sizeof(InputType) * samplesPerDispatch - size_t outputBufferBytes; // sizeof(ResultType) * samplesPerDispatch - }; - - void setup(const SetupData& data) - { - m_device = data.device; - m_logger = data.logger; - m_dispatchGroupCount = data.dispatchGroupCount; - - // Single cmdbuf holds [warmup dispatches][ts 0][bench dispatches][ts 1][cooldown dispatches] - // so the driver can pipeline adjacent dispatches and the trailing bench dispatches - // aren't measured in a winding-down tail. - m_cmdpool = m_device->createCommandPool(data.computeFamilyIndex, video::IGPUCommandPool::CREATE_FLAGS::RESET_COMMAND_BUFFER_BIT); - if (!m_cmdpool->createCommandBuffers(video::IGPUCommandPool::BUFFER_LEVEL::PRIMARY, 1u, &m_benchmarkCmdbuf)) - m_logger->log("CSamplerBenchmark: failed to create benchmark cmdbuf", system::ILogger::ELL_ERROR); - - // Timestamp query pool (2 queries: before and after) - { - video::IQueryPool::SCreationParams qparams = {}; - qparams.queryType = video::IQueryPool::TYPE::TIMESTAMP; - qparams.queryCount = 2; - qparams.pipelineStatisticsFlags = video::IQueryPool::PIPELINE_STATISTICS_FLAGS::NONE; - m_queryPool = m_device->createQueryPool(qparams); - if (!m_queryPool) - m_logger->log("CSamplerBenchmark: failed to create query pool", system::ILogger::ELL_ERROR); - } - - // Load and compile shader - core::smart_refctd_ptr shader; - { - asset::IAssetLoader::SAssetLoadParams lp = {}; - lp.logger = m_logger.get(); - lp.workingDirectory = "app_resources"; - auto bundle = data.assetMgr->getAsset(data.shaderKey, lp); - const auto assets = bundle.getContents(); - if (assets.empty()) - { - m_logger->log("CSamplerBenchmark: failed to load shader", system::ILogger::ELL_ERROR); - return; - } - auto source = asset::IAsset::castDown(assets[0]); - shader = m_device->compileShader({ source.get() }); - } - - // Descriptor set layout: binding 0 = input SSBO, binding 1 = output SSBO - video::IGPUDescriptorSetLayout::SBinding bindings[2] = { - { .binding = 0, .type = asset::IDescriptor::E_TYPE::ET_STORAGE_BUFFER, - .createFlags = video::IGPUDescriptorSetLayout::SBinding::E_CREATE_FLAGS::ECF_NONE, - .stageFlags = ShaderStage::ESS_COMPUTE, .count = 1 }, - { .binding = 1, .type = asset::IDescriptor::E_TYPE::ET_STORAGE_BUFFER, - .createFlags = video::IGPUDescriptorSetLayout::SBinding::E_CREATE_FLAGS::ECF_NONE, - .stageFlags = ShaderStage::ESS_COMPUTE, .count = 1 } - }; - auto dsLayout = m_device->createDescriptorSetLayout(bindings); - - const asset::SPushConstantRange pcRange = { - .stageFlags = asset::IShader::E_SHADER_STAGE::ESS_COMPUTE, - .offset = 0, - .size = sizeof(SamplerBenchPushConstants), - }; - m_pplnLayout = m_device->createPipelineLayout({&pcRange, 1}, core::smart_refctd_ptr(dsLayout)); - - { - video::IGPUComputePipeline::SCreationParams pparams = {}; - pparams.layout = m_pplnLayout.get(); - pparams.shader.entryPoint = "main"; - pparams.shader.shader = shader.get(); - if (m_device->getEnabledFeatures().pipelineExecutableInfo) - { - pparams.flags |= IGPUComputePipeline::SCreationParams::FLAGS::CAPTURE_STATISTICS | IGPUComputePipeline::SCreationParams::FLAGS::CAPTURE_INTERNAL_REPRESENTATIONS; - } - if (!m_device->createComputePipelines(nullptr, { &pparams, 1 }, &m_pipeline)) - m_logger->log("CSamplerBenchmark: failed to create compute pipeline", system::ILogger::ELL_ERROR); - - if (m_device->getEnabledFeatures().pipelineExecutableInfo) - m_executableReport = system::to_string(m_pipeline->getExecutableInfo()); - } - - // Allocate input buffer (device-local VRAM, zero-filled via cmdFillBuffer; correctness - // irrelevant for benchmarking but we want deterministic input, not garbage) - core::smart_refctd_ptr inputBuf; - { - video::IGPUBuffer::SCreationParams bparams = {}; - bparams.size = data.inputBufferBytes; - bparams.usage = core::bitflag(video::IGPUBuffer::EUF_STORAGE_BUFFER_BIT) | video::IGPUBuffer::EUF_TRANSFER_DST_BIT; - inputBuf = m_device->createBuffer(std::move(bparams)); - video::IDeviceMemoryBacked::SDeviceMemoryRequirements reqs = inputBuf->getMemoryReqs(); - reqs.memoryTypeBits &= data.physicalDevice->getDeviceLocalMemoryTypeBits(); - m_inputAlloc = m_device->allocate(reqs, inputBuf.get(), video::IDeviceMemoryAllocation::EMAF_NONE); - if (!m_inputAlloc.isValid()) - m_logger->log("CSamplerBenchmark: failed to allocate input buffer memory", system::ILogger::ELL_ERROR); - } - - // Allocate output buffer (device-local VRAM, GPU writes, never read back) - core::smart_refctd_ptr outputBuf; - { - video::IGPUBuffer::SCreationParams bparams = {}; - bparams.size = data.outputBufferBytes; - bparams.usage = core::bitflag(video::IGPUBuffer::EUF_STORAGE_BUFFER_BIT) | video::IGPUBuffer::EUF_SHADER_DEVICE_ADDRESS_BIT; - outputBuf = m_device->createBuffer(std::move(bparams)); - video::IDeviceMemoryBacked::SDeviceMemoryRequirements reqs = outputBuf->getMemoryReqs(); - reqs.memoryTypeBits &= data.physicalDevice->getDeviceLocalMemoryTypeBits(); - m_outputAlloc = m_device->allocate(reqs, outputBuf.get(), video::IDeviceMemoryAllocation::EMAF_DEVICE_ADDRESS_BIT); - if (!m_outputAlloc.isValid()) - m_logger->log("CSamplerBenchmark: failed to allocate output buffer memory", system::ILogger::ELL_ERROR); - m_outputAddress = outputBuf->getDeviceAddress(); - } - - // Zero-fill the input buffer once on the GPU - { - core::smart_refctd_ptr initCmdbuf; - m_cmdpool->createCommandBuffers(video::IGPUCommandPool::BUFFER_LEVEL::PRIMARY, 1u, &initCmdbuf); - initCmdbuf->begin(video::IGPUCommandBuffer::USAGE::ONE_TIME_SUBMIT_BIT); - const asset::SBufferRange range = { .offset = 0, .size = data.inputBufferBytes, .buffer = inputBuf }; - initCmdbuf->fillBuffer(range, 0u); - initCmdbuf->end(); - - auto queue = m_device->getQueue(data.computeFamilyIndex, 0); - const video::IQueue::SSubmitInfo::SCommandBufferInfo cmds[] = { {.cmdbuf = initCmdbuf.get()} }; - video::IQueue::SSubmitInfo submit = {}; - submit.commandBuffers = cmds; - queue->submit({&submit, 1u}); - m_device->waitIdle(); - } - - // Descriptor set: bind both buffers - auto pool = m_device->createDescriptorPoolForDSLayouts(video::IDescriptorPool::ECF_NONE, { &dsLayout.get(), 1 }); - m_ds = pool->createDescriptorSet(core::smart_refctd_ptr(dsLayout)); - { - video::IGPUDescriptorSet::SDescriptorInfo info[2]; - info[0].desc = core::smart_refctd_ptr(inputBuf); - info[0].info.buffer = { .offset = 0, .size = data.inputBufferBytes }; - info[1].desc = core::smart_refctd_ptr(outputBuf); - info[1].info.buffer = { .offset = 0, .size = data.outputBufferBytes }; - video::IGPUDescriptorSet::SWriteDescriptorSet writes[2] = { - { .dstSet = m_ds.get(), .binding = 0, .arrayElement = 0, .count = 1, .info = &info[0] }, - { .dstSet = m_ds.get(), .binding = 1, .arrayElement = 0, .count = 1, .info = &info[1] } - }; - m_device->updateDescriptorSets(writes, {}); - } - - m_queue = m_device->getQueue(data.computeFamilyIndex, 0); - m_samplesPerDispatch = data.samplesPerDispatch; - m_physicalDevice = data.physicalDevice; - } - - void logPipelineReport(const std::string& name) const + public: + struct SetupData : GPUBenchmark::SetupData { - if (!m_executableReport.empty()) - m_logger->log("%s Sampler Benchmark Pipeline Executable Report:\n%s", ILogger::ELL_PERFORMANCE, name.c_str(), m_executableReport.c_str()); - } - - void run(const std::string& samplerName, const std::string& mode, uint32_t warmupIterations = 500, uint32_t benchmarkIterations = 5000) - { - m_device->waitIdle(); - - const uint32_t cooldownIterations = warmupIterations; - - m_benchmarkCmdbuf->reset(video::IGPUCommandBuffer::RESET_FLAGS::NONE); - m_benchmarkCmdbuf->begin(video::IGPUCommandBuffer::USAGE::ONE_TIME_SUBMIT_BIT); - m_benchmarkCmdbuf->resetQueryPool(m_queryPool.get(), 0, 2); - m_benchmarkCmdbuf->bindComputePipeline(m_pipeline.get()); - m_benchmarkCmdbuf->bindDescriptorSets(asset::EPBP_COMPUTE, m_pplnLayout.get(), 0, 1, &m_ds.get()); - { - SamplerBenchPushConstants pc = { .outputAddress = m_outputAddress }; - m_benchmarkCmdbuf->pushConstants(m_pplnLayout.get(), asset::IShader::E_SHADER_STAGE::ESS_COMPUTE, 0, sizeof(pc), &pc); - } - for (uint32_t i = 0u; i < warmupIterations; ++i) - m_benchmarkCmdbuf->dispatch(m_dispatchGroupCount, 1, 1); - m_benchmarkCmdbuf->writeTimestamp(asset::PIPELINE_STAGE_FLAGS::COMPUTE_SHADER_BIT, m_queryPool.get(), 0); - for (uint32_t i = 0u; i < benchmarkIterations; ++i) - m_benchmarkCmdbuf->dispatch(m_dispatchGroupCount, 1, 1); - m_benchmarkCmdbuf->writeTimestamp(asset::PIPELINE_STAGE_FLAGS::COMPUTE_SHADER_BIT, m_queryPool.get(), 1); - for (uint32_t i = 0u; i < cooldownIterations; ++i) - m_benchmarkCmdbuf->dispatch(m_dispatchGroupCount, 1, 1); - m_benchmarkCmdbuf->end(); + core::smart_refctd_ptr assetMgr; + GPUBenchmarkHelper::ShaderVariant variant; // precompiled key OR source path + defines + size_t outputBufferBytes; // sizeof(uint32_t) * threadsPerDispatch + }; - auto semaphore = m_device->createSemaphore(0u); - const video::IQueue::SSubmitInfo::SCommandBufferInfo benchCmds[] = { {.cmdbuf = m_benchmarkCmdbuf.get()} }; - const video::IQueue::SSubmitInfo::SSemaphoreInfo signalSem[] = { - {.semaphore = semaphore.get(), .value = 1u, .stageMask = asset::PIPELINE_STAGE_FLAGS::COMPUTE_SHADER_BIT} - }; - video::IQueue::SSubmitInfo submit = {}; - submit.commandBuffers = benchCmds; - submit.signalSemaphores = signalSem; - m_queue->submit({&submit, 1u}); - - m_device->waitIdle(); - - uint64_t timestamps[2] = {}; - const auto flags = core::bitflag(video::IQueryPool::RESULTS_FLAGS::_64_BIT) | - core::bitflag(video::IQueryPool::RESULTS_FLAGS::WAIT_BIT); - m_device->getQueryPoolResults(m_queryPool.get(), 0, 2, timestamps, sizeof(uint64_t), flags); - - const float64_t timestampPeriod = float64_t(m_physicalDevice->getLimits().timestampPeriodInNanoSeconds); - const float64_t elapsed_ns = float64_t(timestamps[1] - timestamps[0]) * timestampPeriod; - const uint64_t total_samples = uint64_t(benchmarkIterations) * uint64_t(m_samplesPerDispatch); - const float64_t ps_per_sample = elapsed_ns * 1e3 / float64_t(total_samples); - const float64_t gsamples_per_s = float64_t(total_samples) / elapsed_ns; - const float64_t elapsed_ms = elapsed_ns * 1e-6; + CSamplerBenchmark(Aggregator& aggregator, const SetupData& data) + : GPUBenchmark(aggregator, data) // slicing-copy of the GPUBenchmark::SetupData base + { + auto bda = createBdaOutputBuffer(data.outputBufferBytes); + m_outputBuf = std::move(bda.buf); + m_outputAddress = bda.address; - m_logger->log("[Benchmark] %-28s | %-38s | %12.3f | %12.3f | %12.3f", - system::ILogger::ELL_PERFORMANCE, - samplerName.c_str(), mode.c_str(), ps_per_sample, gsamples_per_s, elapsed_ms); - } + m_pipelineIdx = createPipeline(data.variant, data.assetMgr, sizeof(SamplerBenchPushConstants), joinName(data.name)); + } -private: - core::smart_refctd_ptr m_device; - core::smart_refctd_ptr m_logger; - core::smart_refctd_ptr m_cmdpool; - core::smart_refctd_ptr m_benchmarkCmdbuf; - core::smart_refctd_ptr m_queryPool; - core::smart_refctd_ptr m_pplnLayout; - core::smart_refctd_ptr m_pipeline; - core::smart_refctd_ptr m_ds; - video::IDeviceMemoryAllocator::SAllocation m_inputAlloc = {}; - video::IDeviceMemoryAllocator::SAllocation m_outputAlloc = {}; - uint64_t m_outputAddress = 0; - video::IQueue* m_queue = nullptr; - video::IPhysicalDevice* m_physicalDevice = nullptr; - uint32_t m_dispatchGroupCount = 0; - uint32_t m_samplesPerDispatch = 0; - std::string m_executableReport; + void doRun() override + { + const PipelineEntry& pe = m_pipelines[m_pipelineIdx]; + SamplerBenchPushConstants pc = {}; + pc.outputAddress = m_outputAddress; + + const TimingResult t = runTimedBudgeted(getWarmupDispatches(), getTargetBudgetMs(), + [&](video::IGPUCommandBuffer* cb) { defaultBindAndPush(cb, pe, pc); }, + [this](video::IGPUCommandBuffer* cb) { defaultDispatch(cb); }, + samplesForCurrentRow()); + + record(m_name, t, pe.stats); + } + + private: + core::smart_refctd_ptr m_outputBuf; + uint64_t m_outputAddress = 0; + uint32_t m_pipelineIdx = 0; }; #endif diff --git a/37_HLSLSamplingTests/main.cpp b/37_HLSLSamplingTests/main.cpp index e0248d034..634e84123 100644 --- a/37_HLSLSamplingTests/main.cpp +++ b/37_HLSLSamplingTests/main.cpp @@ -1,5 +1,6 @@ #include +#include #include #include "nbl/examples/examples.hpp" @@ -191,149 +192,159 @@ class HLSLSamplingTests final : public application_templates::MonoDeviceApplicat // ====================================================================== // GPU throughput benchmarks // ====================================================================== - constexpr uint32_t testBatchCount = 4096; + constexpr uint32_t benchWorkgroupsCount = 4096; constexpr bool DoBenchmark = true; if constexpr (DoBenchmark) { constexpr uint32_t benchWorkgroupSize = WORKGROUP_SIZE; - constexpr uint32_t totalThreadsPerDispatch = testBatchCount * benchWorkgroupSize; + constexpr uint32_t totalThreadsPerDispatch = benchWorkgroupsCount * benchWorkgroupSize; constexpr uint32_t iterationsPerThread = BENCH_ITERS; constexpr uint32_t benchSamplesPerDispatch = totalThreadsPerDispatch * iterationsPerThread; + constexpr uint32_t warmupDispatches = 300; // unmeasured warmup + cooldown around the timing window + constexpr uint64_t targetBudgetMs = 400; // wall-clock per row; runTimedBudgeted sizes dispatches + + std::vector benchmarks; + + // Single Aggregator owns results, baselines, formatting, and reporting + // for both bench classes. Passed by reference into each bench's ctor. + Aggregator agg(m_logger, m_device, m_physicalDevice, getComputeQueue()->getFamilyIndex()); + const auto cli = agg.applyCli({ + .argv = this->argv, + .defaultOutputPath = "SamplerBench.json", + .appName = "37_HLSLSamplingTests", + }); + + // One context for the whole sampler-bench span: drives both the per-bench + // shape/budget and the banner that runSessionAndReport prints. + const RunContext samplerCtx = { + .shape = { + .workgroupSize = {benchWorkgroupSize, 1u, 1u}, + .dispatchGroupCount = {benchWorkgroupsCount, 1u, 1u}, + .samplesPerDispatch = benchSamplesPerDispatch, + }, + .targetBudgetMs = targetBudgetMs, + .sectionLabel = "GPU Sampler Benchmarks", + }; - struct BenchEntry + auto addBench = [&](const std::initializer_list name, GPUBenchmarkHelper::ShaderVariant variant, size_t outputSize) { - CSamplerBenchmark bench; - std::string sampler; - std::string mode; + CSamplerBenchmark::SetupData data; + data.assetMgr = m_assetMgr; + data.name = name; + data.variant = std::move(variant); + data.outputBufferBytes = outputSize; + data.warmupDispatches = warmupDispatches; + data.shape = samplerCtx.shape; + data.targetBudgetMs = samplerCtx.targetBudgetMs; + + benchmarks.emplace_back(agg, data); }; - std::vector benchmarks; - auto addBench = [&](const char* sampler, const char* mode, const std::string& shaderKey, size_t inputSize, size_t outputSize) + // Convenience wrappers so the 35+ existing precompiled-key calls below stay + // one line each, and adding a new runtime variant is also a one-liner without + // CMake JSON edits. Both go through the same addBench, just construct the + // ShaderVariant differently. + auto addPrecompiled = [&](std::initializer_list name, size_t outputSize) { - auto& entry = benchmarks.emplace_back(); - entry.sampler = sampler; - entry.mode = mode; - - CSamplerBenchmark::SetupData data; - data.device = m_device; - data.api = m_api; - data.assetMgr = m_assetMgr; - data.logger = m_logger; - data.physicalDevice = m_physicalDevice; - data.computeFamilyIndex = getComputeQueue()->getFamilyIndex(); - data.shaderKey = shaderKey; - data.dispatchGroupCount = testBatchCount; - data.samplesPerDispatch = benchSamplesPerDispatch; - data.inputBufferBytes = inputSize; - data.outputBufferBytes = outputSize; - entry.bench.setup(data); + auto shader = nbl::this_example::builtin::build::get_spirv_key(m_device.get()); + addBench(name, GPUBenchmarkHelper::ShaderVariant::Precompiled(std::move(shader)), outputSize); + }; + auto addRuntime = [&](std::initializer_list name, const char* sourcePath, std::vector defines, size_t outputSize) + { + // Mirror CMake's COMMON_OPTIONS so runtime variants see the same baseline + // as precompiled ones. + std::vector all = { + {"WORKGROUP_SIZE", std::to_string(WORKGROUP_SIZE)}, + {"BENCH_ITERS", std::to_string(BENCH_ITERS)}, + }; + all.insert(all.end(), std::make_move_iterator(defines.begin()), std::make_move_iterator(defines.end())); + addBench(name, GPUBenchmarkHelper::ShaderVariant::FromSource(sourcePath, std::move(all)), outputSize); }; - // Bench shaders don't read input (hardcoded values) and write a single uint32_t per thread via RWByteAddressBuffer + // Bench shaders don't read input -- output is BDA via push constants. if constexpr (true) { - constexpr size_t benchInputBytes = sizeof(uint32_t); // unused but binding must exist, didn't bother removing because some samplers need more complex inputs and it's easier to have a consistent buffer setup for all benchmarks constexpr size_t benchOutputBytes = sizeof(uint32_t) * totalThreadsPerDispatch; - addBench("Linear", "1:1", nbl::this_example::builtin::build::get_spirv_key<"linear_bench_1_1">(m_device.get()), benchInputBytes, benchOutputBytes); - addBench("Linear", "1:16", nbl::this_example::builtin::build::get_spirv_key<"linear_bench_1_16">(m_device.get()), benchInputBytes, benchOutputBytes); - addBench("Bilinear", "1:1", nbl::this_example::builtin::build::get_spirv_key<"bilinear_bench_1_1">(m_device.get()), benchInputBytes, benchOutputBytes); - addBench("Bilinear", "1:16", nbl::this_example::builtin::build::get_spirv_key<"bilinear_bench_1_16">(m_device.get()), benchInputBytes, benchOutputBytes); - addBench("BoxMullerTransform", "1:1", nbl::this_example::builtin::build::get_spirv_key<"box_muller_transform_bench_1_1">(m_device.get()), benchInputBytes, benchOutputBytes); - addBench("BoxMullerTransform", "1:16", nbl::this_example::builtin::build::get_spirv_key<"box_muller_transform_bench_1_16">(m_device.get()), benchInputBytes, benchOutputBytes); - addBench("UniformHemisphere", "1:1", nbl::this_example::builtin::build::get_spirv_key<"uniform_hemisphere_bench_1_1">(m_device.get()), benchInputBytes, benchOutputBytes); - addBench("UniformHemisphere", "1:16", nbl::this_example::builtin::build::get_spirv_key<"uniform_hemisphere_bench_1_16">(m_device.get()), benchInputBytes, benchOutputBytes); - addBench("UniformSphere", "1:1", nbl::this_example::builtin::build::get_spirv_key<"uniform_sphere_bench_1_1">(m_device.get()), benchInputBytes, benchOutputBytes); - addBench("UniformSphere", "1:16", nbl::this_example::builtin::build::get_spirv_key<"uniform_sphere_bench_1_16">(m_device.get()), benchInputBytes, benchOutputBytes); - addBench("ConcentricMapping", "1:1", nbl::this_example::builtin::build::get_spirv_key<"concentric_mapping_bench_1_1">(m_device.get()), benchInputBytes, benchOutputBytes); - addBench("ConcentricMapping", "1:16", nbl::this_example::builtin::build::get_spirv_key<"concentric_mapping_bench_1_16">(m_device.get()), benchInputBytes, benchOutputBytes); - addBench("PolarMapping", "1:1", nbl::this_example::builtin::build::get_spirv_key<"polar_mapping_bench_1_1">(m_device.get()), benchInputBytes, benchOutputBytes); - addBench("PolarMapping", "1:16", nbl::this_example::builtin::build::get_spirv_key<"polar_mapping_bench_1_16">(m_device.get()), benchInputBytes, benchOutputBytes); - addBench("ProjectedHemisphere", "1:1", nbl::this_example::builtin::build::get_spirv_key<"projected_hemisphere_bench_1_1">(m_device.get()), benchInputBytes, benchOutputBytes); - addBench("ProjectedHemisphere", "1:16", nbl::this_example::builtin::build::get_spirv_key<"projected_hemisphere_bench_1_16">(m_device.get()), benchInputBytes, benchOutputBytes); - addBench("ProjectedSphere", "1:1", nbl::this_example::builtin::build::get_spirv_key<"projected_sphere_bench_1_1">(m_device.get()), benchInputBytes, benchOutputBytes); - addBench("ProjectedSphere", "1:16", nbl::this_example::builtin::build::get_spirv_key<"projected_sphere_bench_1_16">(m_device.get()), benchInputBytes, benchOutputBytes); - addBench("SphericalRectangle", "1:1 (shape,observer)", nbl::this_example::builtin::build::get_spirv_key<"spherical_rectangle_bench_1_1_shape_observer">(m_device.get()), benchInputBytes, benchOutputBytes); - addBench("SphericalRectangle", "1:16 (shape,observer)", nbl::this_example::builtin::build::get_spirv_key<"spherical_rectangle_bench_1_16_shape_observer">(m_device.get()), benchInputBytes, benchOutputBytes); - addBench("SphericalRectangle", "1:1 (sa,extents)", nbl::this_example::builtin::build::get_spirv_key<"spherical_rectangle_bench_1_1_sa_extents">(m_device.get()), benchInputBytes, benchOutputBytes); - addBench("SphericalRectangle", "1:16 (sa,extents)", nbl::this_example::builtin::build::get_spirv_key<"spherical_rectangle_bench_1_16_sa_extents">(m_device.get()), benchInputBytes, benchOutputBytes); - addBench("SphericalRectangle", "1:1 (r0,extents)", nbl::this_example::builtin::build::get_spirv_key<"spherical_rectangle_bench_1_1_r0_extents">(m_device.get()), benchInputBytes, benchOutputBytes); - addBench("SphericalRectangle", "1:16 (r0,extents)", nbl::this_example::builtin::build::get_spirv_key<"spherical_rectangle_bench_1_16_r0_extents">(m_device.get()), benchInputBytes, benchOutputBytes); - addBench("SphericalRectangle", "create-only (shape,observer)", nbl::this_example::builtin::build::get_spirv_key<"spherical_rectangle_bench_create_only_shape_observer">(m_device.get()), benchInputBytes, benchOutputBytes); - addBench("SphericalRectangle", "create-only (sa,extents)", nbl::this_example::builtin::build::get_spirv_key<"spherical_rectangle_bench_create_only_sa_extents">(m_device.get()), benchInputBytes, benchOutputBytes); - addBench("SphericalRectangle", "create-only (r0,extents)", nbl::this_example::builtin::build::get_spirv_key<"spherical_rectangle_bench_create_only_r0_extents">(m_device.get()), benchInputBytes, benchOutputBytes); - addBench("ProjectedSphericalRectangle", "1:1", nbl::this_example::builtin::build::get_spirv_key<"projected_spherical_rectangle_bench_1_1">(m_device.get()), benchInputBytes, benchOutputBytes); - addBench("ProjectedSphericalRectangle", "1:16", nbl::this_example::builtin::build::get_spirv_key<"projected_spherical_rectangle_bench_1_16">(m_device.get()), benchInputBytes, benchOutputBytes); - addBench("ProjectedSphericalRectangle", "create-only", nbl::this_example::builtin::build::get_spirv_key<"projected_spherical_rectangle_bench_create_only">(m_device.get()), benchInputBytes, benchOutputBytes); - addBench("SphericalTriangle", "1:1", nbl::this_example::builtin::build::get_spirv_key<"spherical_triangle_bench_1_1">(m_device.get()), benchInputBytes, benchOutputBytes); - addBench("SphericalTriangle", "1:16", nbl::this_example::builtin::build::get_spirv_key<"spherical_triangle_bench_1_16">(m_device.get()), benchInputBytes, benchOutputBytes); - addBench("SphericalTriangle", "create-only", nbl::this_example::builtin::build::get_spirv_key<"spherical_triangle_bench_create_only">(m_device.get()), benchInputBytes, benchOutputBytes); - addBench("ProjectedSphericalTriangle", "1:1", nbl::this_example::builtin::build::get_spirv_key<"projected_spherical_triangle_bench_1_1">(m_device.get()), benchInputBytes, benchOutputBytes); - addBench("ProjectedSphericalTriangle", "1:16", nbl::this_example::builtin::build::get_spirv_key<"projected_spherical_triangle_bench_1_16">(m_device.get()), benchInputBytes, benchOutputBytes); - addBench("ProjectedSphericalTriangle", "create-only", nbl::this_example::builtin::build::get_spirv_key<"projected_spherical_triangle_bench_create_only">(m_device.get()), benchInputBytes, benchOutputBytes); + addPrecompiled.operator()<"linear_bench_1_1">({"Linear", "Linear", "1:1"}, benchOutputBytes); + addPrecompiled.operator()<"linear_bench_1_16">({"Linear", "Linear", "1:16"}, benchOutputBytes); + addPrecompiled.operator()<"bilinear_bench_1_1">({"Linear", "Bilinear", "1:1"}, benchOutputBytes); + addPrecompiled.operator()<"bilinear_bench_1_16">({"Linear", "Bilinear", "1:16"}, benchOutputBytes); + addPrecompiled.operator()<"box_muller_transform_bench_1_1">({"Gaussian", "BoxMullerTransform", "1:1"}, benchOutputBytes); + addPrecompiled.operator()<"box_muller_transform_bench_1_16">({"Gaussian", "BoxMullerTransform", "1:16"}, benchOutputBytes); + addPrecompiled.operator()<"uniform_hemisphere_bench_1_1">({"SphereSampling", "UniformHemisphere", "1:1"}, benchOutputBytes); + addPrecompiled.operator()<"uniform_hemisphere_bench_1_16">({"SphereSampling", "UniformHemisphere", "1:16"}, benchOutputBytes); + addPrecompiled.operator()<"uniform_sphere_bench_1_1">({"SphereSampling", "UniformSphere", "1:1"}, benchOutputBytes); + addPrecompiled.operator()<"uniform_sphere_bench_1_16">({"SphereSampling", "UniformSphere", "1:16"}, benchOutputBytes); + addPrecompiled.operator()<"projected_hemisphere_bench_1_1">({"SphereSampling", "ProjectedHemisphere", "1:1"}, benchOutputBytes); + addPrecompiled.operator()<"projected_hemisphere_bench_1_16">({"SphereSampling", "ProjectedHemisphere", "1:16"}, benchOutputBytes); + addPrecompiled.operator()<"projected_sphere_bench_1_1">({"SphereSampling", "ProjectedSphere", "1:1"}, benchOutputBytes); + addPrecompiled.operator()<"projected_sphere_bench_1_16">({"SphereSampling", "ProjectedSphere", "1:16"}, benchOutputBytes); + addPrecompiled.operator()<"concentric_mapping_bench_1_1">({"DiskMappers", "ConcentricMapping", "1:1"}, benchOutputBytes); + addPrecompiled.operator()<"concentric_mapping_bench_1_16">({"DiskMappers", "ConcentricMapping", "1:16"}, benchOutputBytes); + addPrecompiled.operator()<"polar_mapping_bench_1_1">({"DiskMappers", "PolarMapping", "1:1"}, benchOutputBytes); + addPrecompiled.operator()<"polar_mapping_bench_1_16">({"DiskMappers", "PolarMapping", "1:16"}, benchOutputBytes); + addPrecompiled.operator()<"spherical_rectangle_bench_1_1_shape_observer">({"SphShapes", "SphRect", "1:1", "shape,observer"}, benchOutputBytes); + addPrecompiled.operator()<"spherical_rectangle_bench_1_1_sa_extents">({"SphShapes", "SphRect", "1:1", "sa,extents"}, benchOutputBytes); + addPrecompiled.operator()<"spherical_rectangle_bench_1_1_r0_extents">({"SphShapes", "SphRect", "1:1", "r0,extents"}, benchOutputBytes); + addPrecompiled.operator()<"spherical_rectangle_bench_1_16_shape_observer">({"SphShapes", "SphRect", "1:16", "shape,observer"}, benchOutputBytes); + addPrecompiled.operator()<"spherical_rectangle_bench_1_16_sa_extents">({"SphShapes", "SphRect", "1:16", "sa,extents"}, benchOutputBytes); + addPrecompiled.operator()<"spherical_rectangle_bench_1_16_r0_extents">({"SphShapes", "SphRect", "1:16", "r0,extents"}, benchOutputBytes); + addPrecompiled.operator()<"spherical_rectangle_bench_create_only_shape_observer">({"SphShapes", "SphRect", "create-only", "shape,observer"}, benchOutputBytes); + addPrecompiled.operator()<"spherical_rectangle_bench_create_only_sa_extents">({"SphShapes", "SphRect", "create-only", "sa,extents"}, benchOutputBytes); + addPrecompiled.operator()<"spherical_rectangle_bench_create_only_r0_extents">({"SphShapes", "SphRect", "create-only", "r0,extents"}, benchOutputBytes); + addPrecompiled.operator()<"projected_spherical_rectangle_bench_1_1">({"SphShapes", "ProjSphRect", "1:1"}, benchOutputBytes); + addPrecompiled.operator()<"projected_spherical_rectangle_bench_1_16">({"SphShapes", "ProjSphRect", "1:16"}, benchOutputBytes); + addPrecompiled.operator()<"projected_spherical_rectangle_bench_create_only">({"SphShapes", "ProjSphRect", "create-only"}, benchOutputBytes); + addPrecompiled.operator()<"spherical_triangle_bench_1_1">({"SphShapes", "SphTri", "1:1"}, benchOutputBytes); + addPrecompiled.operator()<"spherical_triangle_bench_1_16">({"SphShapes", "SphTri", "1:16"}, benchOutputBytes); + addPrecompiled.operator()<"spherical_triangle_bench_create_only">({"SphShapes", "SphTri", "create-only"}, benchOutputBytes); + addPrecompiled.operator()<"projected_spherical_triangle_bench_1_1">({"SphShapes", "ProjSphTri", "1:1"}, benchOutputBytes); + addPrecompiled.operator()<"projected_spherical_triangle_bench_1_16">({"SphShapes", "ProjSphTri", "1:16"}, benchOutputBytes); + addPrecompiled.operator()<"projected_spherical_triangle_bench_create_only">({"SphShapes", "ProjSphTri", "create-only"}, benchOutputBytes); + // ---- Runtime-compiled demo variants (no CMake JSON edit needed) ---- + // Same .hlsl source as the precompiled "linear_bench_1_*" entries, but with + // a `BENCH_SAMPLES_PER_CREATE` value that has no JSON entry. Add as many + // here as you want -- each is a one-liner, no reconfigure required. + //addRuntime({"Linear", "Linear", "1:4 (rt)"}, "shaders/linear_test.comp.hlsl", {{"BENCH_SAMPLES_PER_CREATE", "4"}}, benchOutputBytes); + //addRuntime({"Linear", "Linear", "1:8 (rt)"}, "shaders/linear_test.comp.hlsl", {{"BENCH_SAMPLES_PER_CREATE", "8"}}, benchOutputBytes); } - // Print all pipeline reports first - for (auto& entry : benchmarks) - entry.bench.logPipelineReport(entry.sampler + " (" + entry.mode + ")"); - // Discrete sampler benchmark: alias table vs cumulative probability (BDA) { CDiscreteSamplerBenchmark::SetupData dsData; - dsData.device = m_device; - dsData.api = m_api; - dsData.assetMgr = m_assetMgr; - dsData.logger = m_logger; - dsData.physicalDevice = m_physicalDevice; - dsData.computeFamilyIndex = getComputeQueue()->getFamilyIndex(); - dsData.packedAliasAShaderKey = nbl::this_example::builtin::build::get_spirv_key<"packed_alias_a_bench">(m_device.get()); - dsData.packedAliasBShaderKey = nbl::this_example::builtin::build::get_spirv_key<"packed_alias_b_bench">(m_device.get()); - dsData.cumProbShaderKey = nbl::this_example::builtin::build::get_spirv_key<"cumulative_probability_bench">(m_device.get()); - dsData.cumProbYoloShaderKey = nbl::this_example::builtin::build::get_spirv_key<"cumulative_probability_yolo_bench">(m_device.get()); - dsData.cumProbEytzingerShaderKey = nbl::this_example::builtin::build::get_spirv_key<"cumulative_probability_eytzinger_bench">(m_device.get()); - dsData.dispatchGroupCount = testBatchCount; - - CDiscreteSamplerBenchmark discreteBench; - discreteBench.setup(dsData); - - // Then run all benchmarks here so the reports are at the top of the log, followed by timings - { - constexpr uint32_t warmupDispatches = 300; - constexpr uint32_t benchDispatches = 1000; - m_logger->log("=== GPU Sampler Benchmarks (%u dispatches, %u threads/dispatch, %u iters/thread, ps/sample is per all GPU threads) ===", - ILogger::ELL_PERFORMANCE, benchDispatches, totalThreadsPerDispatch, iterationsPerThread); - m_logger->log(" %-28s | %-38s | %12s | %12s | %12s", - ILogger::ELL_PERFORMANCE, "Sampler", "Mode", "ps/sample", "GSamples/s", "ms total"); - for (auto& entry : benchmarks) - entry.bench.run(entry.sampler, entry.mode, warmupDispatches, benchDispatches); - } - - { - // If you change something here, better change kBenchTable below too - const std::vector discreteSizes = { - 2u, 4u, 8u, 16u, 32u, 64u, 100u, 128u, 256u, 400u, 512u, 1024u, 2048u, 2049u, 3000u, 4096u, 7000u, 8192u, 10'000u, 16'384u, 32'768u, - 65'536u, 131'072u, 262'144u, 524'288u, 1'000'000u, 1'048'576u, 2'097'152u, 16'777'216u, 20'971'520u, 25'165'824u, 33'554'432u}; - - // Per-N dispatch counts calibrated from a prior measured run - auto dispatchScheduler = [](uint32_t N) -> CDiscreteSamplerBenchmark::DispatchCounts - { - static constexpr std::pair kBenchTable[] = { - {2u, 7180u}, {4u, 5993u}, {8u, 4490u}, {16u, 4099u}, {32u, 3110u}, {64u, 3026u}, {100u, 2507u}, {128u, 2498u}, {256u, 2477u}, {400u, 2001u}, - {512u, 1827u}, {1024u, 1372u}, {2048u, 1010u}, {2049u, 1010u}, {3000u, 859u}, {4096u, 962u}, {7000u, 742u}, {8192u, 833u}, {10'000u, 590u}, {16'384u, 786u}, {32'768u, 608u}, - {65'536u, 283u}, {131'072u, 174u}, {262'144u, 160u}, {524'288u, 133u}, {1'000'000u, 77u}, {1'048'576u, 128u}, {2'097'152u, 106u}, {16'777'216u, 17u}, {20'971'520u, 17u}, {25'165'824u, 16u}, {33'554'432u, 14u}}; - uint32_t bench = 10u; // fallback for any N not in the table - for (const auto& e : kBenchTable) - if (e.first == N) - { - bench = e.second; - break; - } - const uint32_t warmup = std::max(5u, bench / 10u); - return {warmup, bench}; - }; - - discreteBench.runSweep(discreteSizes, dispatchScheduler); - } + dsData.assetMgr = m_assetMgr; + dsData.packedAliasAVariant = GPUBenchmarkHelper::ShaderVariant::Precompiled(nbl::this_example::builtin::build::get_spirv_key<"packed_alias_a_bench">(m_device.get())); + dsData.packedAliasBVariant = GPUBenchmarkHelper::ShaderVariant::Precompiled(nbl::this_example::builtin::build::get_spirv_key<"packed_alias_b_bench">(m_device.get())); + dsData.cumProbVariant = GPUBenchmarkHelper::ShaderVariant::Precompiled(nbl::this_example::builtin::build::get_spirv_key<"cumulative_probability_bench">(m_device.get())); + dsData.cumProbYoloVariant = GPUBenchmarkHelper::ShaderVariant::Precompiled(nbl::this_example::builtin::build::get_spirv_key<"cumulative_probability_yolo_bench">(m_device.get())); + dsData.cumProbEytzingerVariant = GPUBenchmarkHelper::ShaderVariant::Precompiled(nbl::this_example::builtin::build::get_spirv_key<"cumulative_probability_eytzinger_bench">(m_device.get())); + dsData.dispatchGroupCount = {benchWorkgroupsCount, 1u, 1u}; + dsData.targetBudgetMs = targetBudgetMs; + + // Just the N values now -- runTimedBudgeted sizes dispatches per + // row to hit the budget. The old per-N tuning table is gone. + static constexpr uint32_t kSweepNs[] = { + 2u, 4u, 8u, 16u, 32u, 64u, 100u, 128u, 256u, 400u, + 512u, 1024u, 2048u, 2049u, 3000u, 4096u, 7000u, 8192u, 10'000u, 16'384u, 32'768u, + 65'536u, 131'072u, 262'144u, 524'288u, 1'000'000u, 1'048'576u, 2'097'152u, 16'777'216u, 20'971'520u, 25'165'824u, 33'554'432u}; + dsData.sweepNs = kSweepNs; + + CDiscreteSamplerBenchmark discreteBench(agg, dsData); + + const RunContext discreteCtx = { + .shape = CDiscreteSamplerBenchmark::shapeFor(dsData), + .targetBudgetMs = targetBudgetMs, + .sectionLabel = "Discrete Sampler Sweep", + }; + + // Single call. Each span contributes its own focus rows first, then + // every span's unfocused rows -- the aggregator iterates both packs + // in each phase. CDiscrete's overridden run() does per-row filtering + // against cli.focusVariants since its rows aren't a flat list. + agg.runSessionAndReport( + Aggregator::makeSpan(benchmarks, samplerCtx), + Aggregator::makeSpan(discreteBench, discreteCtx)); } } @@ -341,9 +352,9 @@ class HLSLSamplingTests final : public application_templates::MonoDeviceApplicat // Runtime CPU/GPU comparison tests using ITester harness // ================================================================ bool pass = true; - + constexpr uint32_t testWorkgroupsCount = 4096; // generic lambda to run a GPU sampler test - auto runSamplerTest = [&](const char* testName, auto spirvKey, const char* logFile) + auto runSamplerTest = [&](const char* testName, const char* logFile) { m_logger->log("Running %s tests...", ILogger::ELL_INFO, testName); typename Tester::PipelineSetupData data; @@ -353,8 +364,8 @@ class HLSLSamplingTests final : public application_templates::MonoDeviceApplicat data.logger = m_logger; data.physicalDevice = m_physicalDevice; data.computeFamilyIndex = getComputeQueue()->getFamilyIndex(); - data.shaderKey = std::move(spirvKey); - Tester tester(testBatchCount); + data.shaderKey = std::move(nbl::this_example::builtin::build::get_spirv_key(m_device.get())); + Tester tester(testWorkgroupsCount); tester.setupPipeline(data); pass &= tester.performTestsAndVerifyResults(logFile); }; @@ -362,22 +373,22 @@ class HLSLSamplingTests final : public application_templates::MonoDeviceApplicat // --- Sampler tests --- if constexpr (true) { - runSamplerTest.operator()("Linear sampler", nbl::this_example::builtin::build::get_spirv_key<"linear_test">(m_device.get()), "LinearTestLog.txt"); - runSamplerTest.operator()("Bilinear sampler", nbl::this_example::builtin::build::get_spirv_key<"bilinear_test">(m_device.get()), "BilinearTestLog.txt"); - runSamplerTest.operator()("UniformHemisphere sampler", nbl::this_example::builtin::build::get_spirv_key<"uniform_hemisphere_test">(m_device.get()), "UniformHemisphereTestLog.txt"); - runSamplerTest.operator()("UniformSphere sampler", nbl::this_example::builtin::build::get_spirv_key<"uniform_sphere_test">(m_device.get()), "UniformSphereTestLog.txt"); - runSamplerTest.operator()("ProjectedHemisphere sampler", nbl::this_example::builtin::build::get_spirv_key<"projected_hemisphere_test">(m_device.get()), "ProjectedHemisphereTestLog.txt"); - runSamplerTest.operator()("ProjectedSphere sampler", nbl::this_example::builtin::build::get_spirv_key<"projected_sphere_test">(m_device.get()), "ProjectedSphereTestLog.txt"); - runSamplerTest.operator()("ConcentricMapping sampler", nbl::this_example::builtin::build::get_spirv_key<"concentric_mapping_test">(m_device.get()), "ConcentricMappingTestLog.txt"); - runSamplerTest.operator()("PolarMapping sampler", nbl::this_example::builtin::build::get_spirv_key<"polar_mapping_test">(m_device.get()), "PolarMappingTestLog.txt"); - runSamplerTest.operator()("BoxMullerTransform sampler", nbl::this_example::builtin::build::get_spirv_key<"box_muller_transform_test">(m_device.get()), "BoxMullerTransformTestLog.txt"); - runSamplerTest.operator()("SphericalTriangle", nbl::this_example::builtin::build::get_spirv_key<"spherical_triangle">(m_device.get()), "SphericalTriangleTestLog.txt"); - runSamplerTest.operator()("ProjectedSphericalTriangle sampler", nbl::this_example::builtin::build::get_spirv_key<"projected_spherical_triangle_test">(m_device.get()), "ProjectedSphericalTriangleTestLog.txt"); - runSamplerTest.operator()("SphericalRectangle sampler", nbl::this_example::builtin::build::get_spirv_key<"spherical_rectangle_test">(m_device.get()), "SphericalRectangleTestLog.txt"); - runSamplerTest.operator()("ProjectedSphericalRectangle sampler", nbl::this_example::builtin::build::get_spirv_key<"projected_spherical_rectangle_test">(m_device.get()), "ProjectedSphericalRectangleTestLog.txt"); + runSamplerTest.operator()("Linear sampler", "LinearTestLog.txt"); + runSamplerTest.operator()("Bilinear sampler", "BilinearTestLog.txt"); + runSamplerTest.operator()("UniformHemisphere sampler", "UniformHemisphereTestLog.txt"); + runSamplerTest.operator()("UniformSphere sampler", "UniformSphereTestLog.txt"); + runSamplerTest.operator()("ProjectedHemisphere sampler", "ProjectedHemisphereTestLog.txt"); + runSamplerTest.operator()("ProjectedSphere sampler", "ProjectedSphereTestLog.txt"); + runSamplerTest.operator()("ConcentricMapping sampler", "ConcentricMappingTestLog.txt"); + runSamplerTest.operator()("PolarMapping sampler", "PolarMappingTestLog.txt"); + runSamplerTest.operator()("BoxMullerTransform sampler", "BoxMullerTransformTestLog.txt"); + runSamplerTest.operator()("SphericalTriangle", "SphericalTriangleTestLog.txt"); + runSamplerTest.operator()("ProjectedSphericalTriangle sampler", "ProjectedSphericalTriangleTestLog.txt"); + runSamplerTest.operator()("SphericalRectangle sampler", "SphericalRectangleTestLog.txt"); + runSamplerTest.operator()("ProjectedSphericalRectangle sampler", "ProjectedSphericalRectangleTestLog.txt"); } - if constexpr (DoBenchmark) + if constexpr (true) { // --- Discrete table construction (CPU) --- { @@ -387,9 +398,9 @@ class HLSLSamplingTests final : public application_templates::MonoDeviceApplicat } // --- GPU table sampler tests --- - runSamplerTest.operator()("PackedAliasA GPU sampler", nbl::this_example::builtin::build::get_spirv_key<"packed_alias_a_test">(m_device.get()), "PackedAliasATestLog.txt"); - runSamplerTest.operator()("PackedAliasB GPU sampler", nbl::this_example::builtin::build::get_spirv_key<"packed_alias_b_test">(m_device.get()), "PackedAliasBTestLog.txt"); - runSamplerTest.operator()("CumulativeProbability GPU sampler", nbl::this_example::builtin::build::get_spirv_key<"cumulative_probability_test">(m_device.get()), "CumulativeProbabilityTestLog.txt"); + runSamplerTest.operator()("PackedAliasA GPU sampler", "PackedAliasATestLog.txt"); + runSamplerTest.operator()("PackedAliasB GPU sampler", "PackedAliasBTestLog.txt"); + runSamplerTest.operator()("CumulativeProbability GPU sampler", "CumulativeProbabilityTestLog.txt"); } logJacobianSkipCounts(m_logger.get()); if (pass) @@ -405,61 +416,33 @@ class HLSLSamplingTests final : public application_templates::MonoDeviceApplicat m_logger->log("Running sampler property tests (CPU)...", ILogger::ELL_INFO); m_logger->log("WARNING: CPU math may use higher intermediate precision than GPU shaders. Tolerances that pass here may be too tight for GPU.", ILogger::ELL_WARNING); - CSamplerPropertyTester linearProps(m_logger.get()); - pass &= linearProps.run(); - - CSamplerPropertyTester bilinearProps(m_logger.get()); - pass &= bilinearProps.run(); - - CSamplerPropertyTester uniformHemiProps(m_logger.get()); - pass &= uniformHemiProps.run(); - - CSamplerPropertyTester uniformSphereProps(m_logger.get()); - pass &= uniformSphereProps.run(); - - CSamplerPropertyTester projHemiProps(m_logger.get()); - pass &= projHemiProps.run(); - - CSamplerPropertyTester projSphereProps(m_logger.get()); - pass &= projSphereProps.run(); - - CSamplerPropertyTester concentricProps(m_logger.get()); - pass &= concentricProps.run(); - - CSamplerPropertyTester polarProps(m_logger.get()); - pass &= polarProps.run(); - - CSamplerPropertyTester boxMullerProps(m_logger.get()); - pass &= boxMullerProps.run(); - - CSamplerPropertyTester sphTriProps(m_logger.get()); - pass &= sphTriProps.run(); - - CSamplerPropertyTester projSphTriProps(m_logger.get()); - pass &= projSphTriProps.run(); - - CSamplerPropertyTester sphRectProps(m_logger.get()); - pass &= sphRectProps.run(); + auto check = [&]() + { + pass &= CSamplerPropertyTester(m_logger.get()).run(); + }; - CSamplerPropertyTester projSphRectProps(m_logger.get()); - pass &= projSphRectProps.run(); + check.operator()(); + check.operator()(); + check.operator()(); + check.operator()(); + check.operator()(); + check.operator()(); + check.operator()(); + check.operator()(); + check.operator()(); + check.operator()(); + check.operator()(); + check.operator()(); + check.operator()(); // Stress tests: extreme coefficient ratios - CSamplerPropertyTester linearStress(m_logger.get()); - pass &= linearStress.run(); - - CSamplerPropertyTester bilinearStress(m_logger.get()); - pass &= bilinearStress.run(); - - CSamplerPropertyTester bilinearPST(m_logger.get()); - pass &= bilinearPST.run(); - - CSamplerPropertyTester sphTriStress(m_logger.get()); - pass &= sphTriStress.run(); + check.operator()(); + check.operator()(); + check.operator()(); + check.operator()(); // Grazing angle tests - CSamplerPropertyTester grazingProps(m_logger.get()); - pass &= grazingProps.run(); + check.operator()(); if (pass) m_logger->log("All sampler property tests PASSED.", ILogger::ELL_INFO); @@ -475,23 +458,17 @@ class HLSLSamplingTests final : public application_templates::MonoDeviceApplicat m_logger->log("Running geometry tests (CPU)...", ILogger::ELL_INFO); m_logger->log("WARNING: CPU math may use higher intermediate precision than GPU shaders. Tolerances that pass here may be too tight for GPU.", ILogger::ELL_WARNING); - CSolidAngleAccuracyTester solidAngleTester(m_logger.get()); - pass &= solidAngleTester.run(); - - CSphericalTriangleGenerateTester sphTriGenTester(m_logger.get()); - pass &= sphTriGenTester.run(); - - CSphericalRectangleGenerateTester sphRectGenTester(m_logger.get()); - pass &= sphRectGenTester.run(); - - CProjectedSphericalRectangleGenerateTester projRectGenTester(m_logger.get()); - pass &= projRectGenTester.run(); - - CProjectedSphericalRectangleGeometricTester projRectGeoTester(m_logger.get()); - pass &= projRectGeoTester.run(); + auto check = [&]() + { + pass &= Tester(m_logger.get()).run(); + }; - CProjectedSphericalTriangleGeometricTester pstTester(m_logger.get()); - pass &= pstTester.run(); + check.template operator()(); + check.template operator()(); + check.template operator()(); + check.template operator()(); + check.template operator()(); + check.template operator()(); if (pass) m_logger->log("All geometry tests PASSED.", ILogger::ELL_INFO); diff --git a/64_EmulatedFloatTest/main.cpp b/64_EmulatedFloatTest/main.cpp index 8329c03b0..352e4e61f 100644 --- a/64_EmulatedFloatTest/main.cpp +++ b/64_EmulatedFloatTest/main.cpp @@ -6,6 +6,8 @@ #include "nbl/examples/examples.hpp" #include +#include +#include #include #include #include @@ -17,6 +19,8 @@ #include +#include "nbl/examples/Benchmark/IBenchmark.h" +#include "nbl/examples/Benchmark/GPUBenchmarkHelper.h" using namespace nbl::core; using namespace nbl::hlsl; @@ -26,1136 +30,1029 @@ using namespace nbl::video; using namespace nbl::application_templates; using namespace nbl::examples; -constexpr bool DoTests = true; +constexpr bool DoTests = true; constexpr bool DoBenchmark = true; +// One row per EF64_BENCHMARK_MODE. Each instance owns its own write-sink +// buffer + descriptor set; the framework's GPUBenchmarkHelper handles +// cmdbuf / queryPool / pipeline-stats capture / runTimed timing, IBenchmark +// routes the result through the Aggregator. The shader binds an SSBO at +// set 0 / binding 0, so we pass an explicit dsLayout to createPipeline. +class CEF64Benchmark : public GPUBenchmark +{ + public: + static constexpr const char* kSectionLabel = "EF64 Benchmarks"; + + struct SetupData + { + smart_refctd_ptr assetMgr; + core::vector name; // hierarchical row name + EF64_BENCHMARK_MODE mode; // pushed each run() via PC + GPUBenchmarkHelper::ShaderVariant variant; // precompiled "benchmark" SPIRV + uint32_t warmupDispatches; + uint64_t targetBudgetMs; + }; + + // Shape is fixed by the BENCHMARK_WORKGROUP_* macros; expose it so the + // caller uses the same shape both to construct the bench and to build the + // RunContext for its span. + static WorkloadShape shape() + { + const hlsl::uint32_t3 wg = { + BENCHMARK_WORKGROUP_DIMENSION_SIZE_X, + BENCHMARK_WORKGROUP_DIMENSION_SIZE_Y, + BENCHMARK_WORKGROUP_DIMENSION_SIZE_Z}; + const hlsl::uint32_t3 dgc = {BENCHMARK_WORKGROUP_COUNT, 1u, 1u}; + // Shader writes one float64 per thread per dispatch; "sample" == "thread output". + const uint64_t samplesPerDispatch = uint64_t(dgc.x) * dgc.y * dgc.z * wg.x * wg.y * wg.z; + return {.workgroupSize = wg, .dispatchGroupCount = dgc, .samplesPerDispatch = samplesPerDispatch}; + } + + CEF64Benchmark(Aggregator& aggregator, const SetupData& data) + : GPUBenchmark(aggregator, GPUBenchmark::SetupData{ + .name = data.name, + .warmupDispatches = data.warmupDispatches, + .shape = shape(), + .targetBudgetMs = data.targetBudgetMs, + }) + , m_mode(data.mode) + { + // Buffer the shader writes to (descriptor-bound; not BDA). Sized for one + // float64 per thread; the GPU never reads it back to host. + m_buffer = createOutputBuffer(getShape().samplesPerDispatch * sizeof(float64_t)); + + // One SSBO at set 0 / binding 0. createSingleBindingDS wires the + // layout + pool + DS + write descriptor in one call. + auto ds = createSingleBindingDS(m_buffer); + m_dsLayout = std::move(ds.layout); + m_ds = std::move(ds.set); + m_pipelineIdx = createPipeline(data.variant, data.assetMgr, sizeof(BenchmarkPushConstants), joinName(data.name), m_dsLayout); + } + + void doRun() override + { + const PipelineEntry& pe = m_pipelines[m_pipelineIdx]; + BenchmarkPushConstants pc = {}; + pc.benchmarkMode = m_mode; + + const TimingResult t = runTimedBudgeted(getWarmupDispatches(), getTargetBudgetMs(), + [&](IGPUCommandBuffer* cb) + { + cb->bindDescriptorSets(EPBP_COMPUTE, pe.layout.get(), 0, 1, &m_ds.get()); + bindAndPush(cb, pe, pc); + }, + [this](IGPUCommandBuffer* cb) { dispatch(cb); }, + samplesForCurrentRow()); + + record(m_name, t, pe.stats); + } + + private: + EF64_BENCHMARK_MODE m_mode = EF64_BENCHMARK_MODE::NATIVE; + smart_refctd_ptr m_buffer; + smart_refctd_ptr m_dsLayout; + smart_refctd_ptr m_ds; + uint32_t m_pipelineIdx = 0; +}; + class CompatibilityTest final : public MonoDeviceApplication, public BuiltinResourcesApplication { - using device_base_t = MonoDeviceApplication; - using asset_base_t = BuiltinResourcesApplication; -public: - CompatibilityTest(const path& _localInputCWD, const path& _localOutputCWD, const path& _sharedInputCWD, const path& _sharedOutputCWD) : - IApplicationFramework(_localInputCWD, _localOutputCWD, _sharedInputCWD, _sharedOutputCWD) {} - - virtual SPhysicalDeviceFeatures getPreferredDeviceFeatures() const override - { - auto retval = device_base_t::getPreferredDeviceFeatures(); - retval.pipelineExecutableInfo = true; - return retval; - } - - bool onAppInitialized(smart_refctd_ptr&& system) override - { - // since emulated_float64_t rounds to zero - std::fesetround(FE_TOWARDZERO); - - if (!device_base_t::onAppInitialized(smart_refctd_ptr(system))) - return false; - if (!asset_base_t::onAppInitialized(std::move(system))) - return false; - - return true; - } - - void onAppTerminated_impl() override - { - m_device->waitIdle(); - } - - void workLoopBody() override - { - if constexpr (DoTests) - { - emulated_float64_tests(); - } - if constexpr (DoBenchmark) - { - EF64Benchmark benchmark(*this); - benchmark.run(); - } - - m_keepRunning = false; - } - - bool keepRunning() override - { - return m_keepRunning; - } - - -private: - - bool m_keepRunning = true; - - constexpr static inline uint32_t EmulatedFloat64TestIterations = 1000u; - - enum class EmulatedFloatTestDevice - { - CPU, - GPU - }; - - template - bool compareEmulatedFloat64TestValues(const TestValues& expectedValues, const TestValues& testValues) - { - bool success = true; - - auto printOnFailure = [this](EmulatedFloatTestDevice device) - { - std::string errorMsgPrefix = ""; - if (device == EmulatedFloatTestDevice::CPU) - errorMsgPrefix = "CPU test fail:"; - else - errorMsgPrefix = "GPU test fail:"; - - m_logger->log("%s", ILogger::ELL_ERROR, errorMsgPrefix.c_str()); - m_logFile << errorMsgPrefix << '\n'; - }; - - auto printOnArithmeticFailure = [this](const char* valName, uint64_t expectedValue, uint64_t testValue, uint64_t a, uint64_t b) - { - double expectedAsDouble = reinterpret_cast(expectedValue); - double testAsDouble = reinterpret_cast(testValue); - double error = std::abs(expectedAsDouble - testAsDouble); - - std::stringstream ss; - ss << "for input values: A = " << reinterpret_cast(a) << " B = " << reinterpret_cast(b) << '\n'; - ss << valName << " not equal!"; - ss << "\nexpected value: " << std::fixed << std::setprecision(20) << expectedAsDouble; - ss << "\ntest value: " << std::fixed << std::setprecision(20) << testAsDouble; - ss << "\nerror = " << error << '\n'; - ss << "bit representations: \n"; - ss << "seeeeeeeeeeemmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmm\n"; - ss << std::bitset<64>(expectedValue) << " - expectedValue bit pattern\n"; - ss << std::bitset<64>(testValue) << " - testValue bit pattern \n"; - - m_logger->log("%s", ILogger::ELL_ERROR, ss.str().c_str()); - m_logFile << ss.str() << '\n'; - - //std::cout << "ULP error: " << std::max(expectedValue, testValue) - std::min(expectedValue, testValue) << "\n\n"; - - }; - - auto calcULPError = [](emulated_float64_t::storage_t expectedValue, emulated_float64_t::storage_t testValue) - { - return std::max(expectedValue, testValue) - std::min(expectedValue, testValue); - }; - - auto printOnComparisonFailure = [this](const char* valName, int expectedValue, int testValue, double a, double b) - { - std::string inputValuesStr = std::string("for input values: A = ") + std::to_string(a) + std::string(" B = ") + std::to_string(b); - - m_logger->log("%s", ILogger::ELL_ERROR, inputValuesStr.c_str()); - m_logFile << inputValuesStr << '\n'; - - std::stringstream ss; - ss << valName << " not equal!"; - ss << "\nexpected value: " << std::boolalpha << bool(expectedValue); - ss << "\ntest value: " << std::boolalpha << bool(testValue); - - m_logger->log("%s", ILogger::ELL_ERROR, ss.str().c_str()); - m_logFile << ss.str() << '\n'; - }; - - if (calcULPError(expectedValues.int32CreateVal, testValues.int32CreateVal) > 1u) - { - printOnFailure(Device); - printOnArithmeticFailure("int32CreateVal", expectedValues.int32CreateVal, testValues.int32CreateVal, expectedValues.a, expectedValues.b); - success = false; - } - if (calcULPError(expectedValues.int64CreateVal, testValues.int64CreateVal) > 1u) - { - printOnFailure(Device); - printOnArithmeticFailure("int64CreateVal", expectedValues.int64CreateVal, testValues.int64CreateVal, expectedValues.a, expectedValues.b); - success = false; - } - if (calcULPError(expectedValues.uint32CreateVal, testValues.uint32CreateVal) > 1u) - { - printOnFailure(Device); - printOnArithmeticFailure("uint32CreateVal", expectedValues.uint32CreateVal, testValues.uint32CreateVal, expectedValues.a, expectedValues.b); - success = false; - } - if (calcULPError(expectedValues.uint64CreateVal, testValues.uint64CreateVal) > 1u) - { - printOnFailure(Device); - printOnArithmeticFailure("uint64CreateVal", expectedValues.uint64CreateVal, testValues.uint64CreateVal, expectedValues.a, expectedValues.b); - success = false; - } - if (calcULPError(expectedValues.float32CreateVal, testValues.float32CreateVal) > 1u) - { - printOnFailure(Device); - printOnArithmeticFailure("float32CreateVal", expectedValues.float32CreateVal, testValues.float32CreateVal, expectedValues.a, expectedValues.b); - success = false; - } - if (expectedValues.float64CreateVal != testValues.float64CreateVal) - { - printOnFailure(Device); - printOnArithmeticFailure("float64CreateVal", expectedValues.float64CreateVal, testValues.float64CreateVal, expectedValues.a, expectedValues.b); - success = false; - } - if (calcULPError(expectedValues.additionVal, testValues.additionVal) > 1u) - { - printOnFailure(Device); - printOnArithmeticFailure("additionVal", expectedValues.additionVal, testValues.additionVal, expectedValues.a, expectedValues.b); - success = false; - } - if (calcULPError(expectedValues.substractionVal, testValues.substractionVal) > 1u) - { - printOnFailure(Device); - printOnArithmeticFailure("substractionVal", expectedValues.substractionVal, testValues.substractionVal, expectedValues.a, expectedValues.b); - success = false; - } - if (calcULPError(expectedValues.multiplicationVal, testValues.multiplicationVal) > 1u) - { - printOnFailure(Device); - printOnArithmeticFailure("multiplicationVal", expectedValues.multiplicationVal, testValues.multiplicationVal, expectedValues.a, expectedValues.b); - success = false; - } - if (calcULPError(expectedValues.divisionVal, testValues.divisionVal) > 1u) - { - printOnFailure(Device); - printOnArithmeticFailure("divisionVal", expectedValues.divisionVal, testValues.divisionVal, expectedValues.a, expectedValues.b); - success = false; - } - if (expectedValues.lessOrEqualVal != testValues.lessOrEqualVal) - { - printOnFailure(Device); - printOnComparisonFailure("lessOrEqualVal", expectedValues.lessOrEqualVal, testValues.lessOrEqualVal, expectedValues.a, expectedValues.b); - success = false; - } - if (expectedValues.greaterOrEqualVal != testValues.greaterOrEqualVal) - { - printOnFailure(Device); - printOnComparisonFailure("greaterOrEqualVal", expectedValues.greaterOrEqualVal, testValues.greaterOrEqualVal, expectedValues.a, expectedValues.b); - success = false; - } - if (expectedValues.equalVal != testValues.equalVal) - { - printOnFailure(Device); - printOnComparisonFailure("equalVal", expectedValues.equalVal, testValues.equalVal, expectedValues.a, expectedValues.b); - success = false; - } - if (expectedValues.notEqualVal != testValues.notEqualVal) - { - printOnFailure(Device); - printOnComparisonFailure("notEqualVal", expectedValues.notEqualVal, testValues.notEqualVal, expectedValues.a, expectedValues.b); - success = false; - } - if (expectedValues.lessVal != testValues.lessVal) - { - printOnFailure(Device); - printOnComparisonFailure("lessVal", expectedValues.lessVal, testValues.lessVal, expectedValues.a, expectedValues.b); - success = false; - } - if (expectedValues.greaterVal != testValues.greaterVal) - { - printOnFailure(Device); - printOnComparisonFailure("greaterVal", expectedValues.greaterVal, testValues.greaterVal, expectedValues.a, expectedValues.b); - success = false; - } - - return success; - }; - - class EF64Submitter - { - public: - EF64Submitter(CompatibilityTest& base) - :m_base(base), m_pushConstants({}), m_semaphoreCounter(0) - { - // setting up pipeline in the constructor - m_queueFamily = base.getComputeQueue()->getFamilyIndex(); - m_semaphore = base.m_device->createSemaphore(0); - m_cmdpool = base.m_device->createCommandPool(m_queueFamily, IGPUCommandPool::CREATE_FLAGS::RESET_COMMAND_BUFFER_BIT); - if (!m_cmdpool->createCommandBuffers(IGPUCommandPool::BUFFER_LEVEL::PRIMARY, 1u, &m_cmdbuf)) - base.logFail("Failed to create Command Buffers!\n"); - - // Load shaders, set up pipeline + using device_base_t = MonoDeviceApplication; + using asset_base_t = BuiltinResourcesApplication; + + public: + CompatibilityTest(const path& _localInputCWD, const path& _localOutputCWD, const path& _sharedInputCWD, const path& _sharedOutputCWD) : IApplicationFramework(_localInputCWD, _localOutputCWD, _sharedInputCWD, _sharedOutputCWD) {} + + virtual SPhysicalDeviceFeatures getPreferredDeviceFeatures() const override + { + auto retval = device_base_t::getPreferredDeviceFeatures(); + retval.pipelineExecutableInfo = true; + return retval; + } + + bool onAppInitialized(smart_refctd_ptr&& system) override + { + // since emulated_float64_t rounds to zero + std::fesetround(FE_TOWARDZERO); + + if (!device_base_t::onAppInitialized(smart_refctd_ptr(system))) + return false; + if (!asset_base_t::onAppInitialized(std::move(system))) + return false; + + return true; + } + + void onAppTerminated_impl() override + { + m_device->waitIdle(); + } + + void workLoopBody() override + { + if constexpr (DoTests) + { + emulated_float64_tests(); + } + if constexpr (DoBenchmark) + { + runEF64Benchmarks(); + } + + m_keepRunning = false; + } + + bool keepRunning() override + { + return m_keepRunning; + } + + + private: + bool m_keepRunning = true; + + constexpr static inline uint32_t EmulatedFloat64TestIterations = 1000u; + + enum class EmulatedFloatTestDevice + { + CPU, + GPU + }; + + template + bool compareEmulatedFloat64TestValues(const TestValues& expectedValues, const TestValues& testValues) + { + bool success = true; + + auto printOnFailure = [this](EmulatedFloatTestDevice device) + { + std::string errorMsgPrefix = ""; + if (device == EmulatedFloatTestDevice::CPU) + errorMsgPrefix = "CPU test fail:"; + else + errorMsgPrefix = "GPU test fail:"; + + m_logger->log("%s", ILogger::ELL_ERROR, errorMsgPrefix.c_str()); + m_logFile << errorMsgPrefix << '\n'; + }; + + auto printOnArithmeticFailure = [this](const char* valName, uint64_t expectedValue, uint64_t testValue, uint64_t a, uint64_t b) + { + double expectedAsDouble = reinterpret_cast(expectedValue); + double testAsDouble = reinterpret_cast(testValue); + double error = std::abs(expectedAsDouble - testAsDouble); + + std::stringstream ss; + ss << "for input values: A = " << reinterpret_cast(a) << " B = " << reinterpret_cast(b) << '\n'; + ss << valName << " not equal!"; + ss << "\nexpected value: " << std::fixed << std::setprecision(20) << expectedAsDouble; + ss << "\ntest value: " << std::fixed << std::setprecision(20) << testAsDouble; + ss << "\nerror = " << error << '\n'; + ss << "bit representations: \n"; + ss << "seeeeeeeeeeemmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmm\n"; + ss << std::bitset<64>(expectedValue) << " - expectedValue bit pattern\n"; + ss << std::bitset<64>(testValue) << " - testValue bit pattern \n"; + + m_logger->log("%s", ILogger::ELL_ERROR, ss.str().c_str()); + m_logFile << ss.str() << '\n'; + + //std::cout << "ULP error: " << std::max(expectedValue, testValue) - std::min(expectedValue, testValue) << "\n\n"; + }; + + auto calcULPError = [](emulated_float64_t::storage_t expectedValue, emulated_float64_t::storage_t testValue) + { + return std::max(expectedValue, testValue) - std::min(expectedValue, testValue); + }; + + auto printOnComparisonFailure = [this](const char* valName, int expectedValue, int testValue, double a, double b) + { + std::string inputValuesStr = std::string("for input values: A = ") + std::to_string(a) + std::string(" B = ") + std::to_string(b); + + m_logger->log("%s", ILogger::ELL_ERROR, inputValuesStr.c_str()); + m_logFile << inputValuesStr << '\n'; + + std::stringstream ss; + ss << valName << " not equal!"; + ss << "\nexpected value: " << std::boolalpha << bool(expectedValue); + ss << "\ntest value: " << std::boolalpha << bool(testValue); + + m_logger->log("%s", ILogger::ELL_ERROR, ss.str().c_str()); + m_logFile << ss.str() << '\n'; + }; + + if (calcULPError(expectedValues.int32CreateVal, testValues.int32CreateVal) > 1u) + { + printOnFailure(Device); + printOnArithmeticFailure("int32CreateVal", expectedValues.int32CreateVal, testValues.int32CreateVal, expectedValues.a, expectedValues.b); + success = false; + } + if (calcULPError(expectedValues.int64CreateVal, testValues.int64CreateVal) > 1u) + { + printOnFailure(Device); + printOnArithmeticFailure("int64CreateVal", expectedValues.int64CreateVal, testValues.int64CreateVal, expectedValues.a, expectedValues.b); + success = false; + } + if (calcULPError(expectedValues.uint32CreateVal, testValues.uint32CreateVal) > 1u) + { + printOnFailure(Device); + printOnArithmeticFailure("uint32CreateVal", expectedValues.uint32CreateVal, testValues.uint32CreateVal, expectedValues.a, expectedValues.b); + success = false; + } + if (calcULPError(expectedValues.uint64CreateVal, testValues.uint64CreateVal) > 1u) + { + printOnFailure(Device); + printOnArithmeticFailure("uint64CreateVal", expectedValues.uint64CreateVal, testValues.uint64CreateVal, expectedValues.a, expectedValues.b); + success = false; + } + if (calcULPError(expectedValues.float32CreateVal, testValues.float32CreateVal) > 1u) + { + printOnFailure(Device); + printOnArithmeticFailure("float32CreateVal", expectedValues.float32CreateVal, testValues.float32CreateVal, expectedValues.a, expectedValues.b); + success = false; + } + if (expectedValues.float64CreateVal != testValues.float64CreateVal) + { + printOnFailure(Device); + printOnArithmeticFailure("float64CreateVal", expectedValues.float64CreateVal, testValues.float64CreateVal, expectedValues.a, expectedValues.b); + success = false; + } + if (calcULPError(expectedValues.additionVal, testValues.additionVal) > 1u) + { + printOnFailure(Device); + printOnArithmeticFailure("additionVal", expectedValues.additionVal, testValues.additionVal, expectedValues.a, expectedValues.b); + success = false; + } + if (calcULPError(expectedValues.substractionVal, testValues.substractionVal) > 1u) + { + printOnFailure(Device); + printOnArithmeticFailure("substractionVal", expectedValues.substractionVal, testValues.substractionVal, expectedValues.a, expectedValues.b); + success = false; + } + if (calcULPError(expectedValues.multiplicationVal, testValues.multiplicationVal) > 1u) + { + printOnFailure(Device); + printOnArithmeticFailure("multiplicationVal", expectedValues.multiplicationVal, testValues.multiplicationVal, expectedValues.a, expectedValues.b); + success = false; + } + if (calcULPError(expectedValues.divisionVal, testValues.divisionVal) > 1u) + { + printOnFailure(Device); + printOnArithmeticFailure("divisionVal", expectedValues.divisionVal, testValues.divisionVal, expectedValues.a, expectedValues.b); + success = false; + } + if (expectedValues.lessOrEqualVal != testValues.lessOrEqualVal) + { + printOnFailure(Device); + printOnComparisonFailure("lessOrEqualVal", expectedValues.lessOrEqualVal, testValues.lessOrEqualVal, expectedValues.a, expectedValues.b); + success = false; + } + if (expectedValues.greaterOrEqualVal != testValues.greaterOrEqualVal) + { + printOnFailure(Device); + printOnComparisonFailure("greaterOrEqualVal", expectedValues.greaterOrEqualVal, testValues.greaterOrEqualVal, expectedValues.a, expectedValues.b); + success = false; + } + if (expectedValues.equalVal != testValues.equalVal) + { + printOnFailure(Device); + printOnComparisonFailure("equalVal", expectedValues.equalVal, testValues.equalVal, expectedValues.a, expectedValues.b); + success = false; + } + if (expectedValues.notEqualVal != testValues.notEqualVal) + { + printOnFailure(Device); + printOnComparisonFailure("notEqualVal", expectedValues.notEqualVal, testValues.notEqualVal, expectedValues.a, expectedValues.b); + success = false; + } + if (expectedValues.lessVal != testValues.lessVal) + { + printOnFailure(Device); + printOnComparisonFailure("lessVal", expectedValues.lessVal, testValues.lessVal, expectedValues.a, expectedValues.b); + success = false; + } + if (expectedValues.greaterVal != testValues.greaterVal) + { + printOnFailure(Device); + printOnComparisonFailure("greaterVal", expectedValues.greaterVal, testValues.greaterVal, expectedValues.a, expectedValues.b); + success = false; + } + + return success; + }; + + class EF64Submitter + { + public: + EF64Submitter(CompatibilityTest& base) + : m_base(base), m_pushConstants({}), m_semaphoreCounter(0) + { + // setting up pipeline in the constructor + m_queueFamily = base.getComputeQueue()->getFamilyIndex(); + m_semaphore = base.m_device->createSemaphore(0); + m_cmdpool = base.m_device->createCommandPool(m_queueFamily, IGPUCommandPool::CREATE_FLAGS::RESET_COMMAND_BUFFER_BIT); + if (!m_cmdpool->createCommandBuffers(IGPUCommandPool::BUFFER_LEVEL::PRIMARY, 1u, &m_cmdbuf)) + base.logFail("Failed to create Command Buffers!\n"); + + // Load shaders, set up pipeline + { + smart_refctd_ptr shader; { - smart_refctd_ptr shader; - { - IAssetLoader::SAssetLoadParams lp = {}; - lp.logger = base.m_logger.get(); - lp.workingDirectory = "app_resources"; // virtual root - - auto key = nbl::this_example::builtin::build::get_spirv_key<"test">(base.m_device.get()); - auto assetBundle = base.m_assetMgr->getAsset(key.data(), lp); - const auto assets = assetBundle.getContents(); - if (assets.empty()) - { - base.logFail("Could not load shader!"); - assert(0); - } - - // It would be super weird if loading a shader from a file produced more than 1 asset - assert(assets.size() == 1); - shader = IAsset::castDown(assets[0]); - } - - if (!shader) - base.logFail("Failed to load precompiled \"test\" shader!\n"); - - nbl::video::IGPUDescriptorSetLayout::SBinding bindings[1] = { - { - .binding = 0, - .type = nbl::asset::IDescriptor::E_TYPE::ET_STORAGE_BUFFER, - .createFlags = IGPUDescriptorSetLayout::SBinding::E_CREATE_FLAGS::ECF_NONE, - .stageFlags = ShaderStage::ESS_COMPUTE, - .count = 1 - } - }; - smart_refctd_ptr dsLayout = base.m_device->createDescriptorSetLayout(bindings); - if (!dsLayout) - base.logFail("Failed to create a Descriptor Layout!\n"); - - SPushConstantRange pushConstantRanges[] = { - { - .stageFlags = ShaderStage::ESS_COMPUTE, - .offset = 0, - .size = sizeof(PushConstants) - } - }; - m_pplnLayout = base.m_device->createPipelineLayout(pushConstantRanges, smart_refctd_ptr(dsLayout)); - if (!m_pplnLayout) - base.logFail("Failed to create a Pipeline Layout!\n"); - - { - IGPUComputePipeline::SCreationParams params = {}; - params.layout = m_pplnLayout.get(); - params.shader.entryPoint = "main"; - params.shader.shader = shader.get(); - if (base.m_device->getEnabledFeatures().pipelineExecutableInfo) - { - params.flags |= IGPUComputePipeline::SCreationParams::FLAGS::CAPTURE_STATISTICS; - params.flags |= IGPUComputePipeline::SCreationParams::FLAGS::CAPTURE_INTERNAL_REPRESENTATIONS; - } - if (!base.m_device->createComputePipelines(nullptr, { ¶ms,1 }, &m_pipeline)) - base.logFail("Failed to create pipelines (compile & link shaders)!\n"); - - if (base.m_device->getEnabledFeatures().pipelineExecutableInfo) - { - auto report = system::to_string(m_pipeline->getExecutableInfo()); - base.m_logger->log("EF64Submitter Pipeline Executable Report:\n%s", ILogger::ELL_PERFORMANCE, report.c_str()); - } - } - - // Allocate the memory - { - constexpr size_t BufferSize = sizeof(TestValues); - - nbl::video::IGPUBuffer::SCreationParams params = {}; - params.size = BufferSize; - params.usage = IGPUBuffer::EUF_STORAGE_BUFFER_BIT; - smart_refctd_ptr outputBuff = base.m_device->createBuffer(std::move(params)); - if (!outputBuff) - base.logFail("Failed to create a GPU Buffer of size %d!\n", params.size); - - outputBuff->setObjectDebugName("emulated_float64_t output buffer"); - - nbl::video::IDeviceMemoryBacked::SDeviceMemoryRequirements reqs = outputBuff->getMemoryReqs(); - reqs.memoryTypeBits &= base.m_physicalDevice->getHostVisibleMemoryTypeBits(); - - m_allocation = base.m_device->allocate(reqs, outputBuff.get(), nbl::video::IDeviceMemoryAllocation::EMAF_NONE); - if (!m_allocation.isValid()) - base.logFail("Failed to allocate Device Memory compatible with our GPU Buffer!\n"); - - assert(outputBuff->getBoundMemory().memory == m_allocation.memory.get()); - smart_refctd_ptr pool = base.m_device->createDescriptorPoolForDSLayouts(IDescriptorPool::ECF_NONE, { &dsLayout.get(),1 }); - - m_ds = pool->createDescriptorSet(std::move(dsLayout)); - { - IGPUDescriptorSet::SDescriptorInfo info[1]; - info[0].desc = smart_refctd_ptr(outputBuff); - info[0].info.buffer = { .offset = 0,.size = BufferSize }; - IGPUDescriptorSet::SWriteDescriptorSet writes[1] = { - {.dstSet = m_ds.get(),.binding = 0,.arrayElement = 0,.count = 1,.info = info} - }; - base.m_device->updateDescriptorSets(writes, {}); - } - } - - if (!m_allocation.memory->map({ 0ull,m_allocation.memory->getAllocationSize() }, IDeviceMemoryAllocation::EMCAF_READ)) - base.logFail("Failed to map the Device Memory!\n"); + IAssetLoader::SAssetLoadParams lp = {}; + lp.logger = base.m_logger.get(); + lp.workingDirectory = "app_resources"; // virtual root + + auto key = nbl::this_example::builtin::build::get_spirv_key<"test">(base.m_device.get()); + auto assetBundle = base.m_assetMgr->getAsset(key.data(), lp); + const auto assets = assetBundle.getContents(); + if (assets.empty()) + { + base.logFail("Could not load shader!"); + assert(0); + } + + // It would be super weird if loading a shader from a file produced more than 1 asset + assert(assets.size() == 1); + shader = IAsset::castDown(assets[0]); } - // if the mapping is not coherent the range needs to be invalidated to pull in new data for the CPU's caches - const ILogicalDevice::MappedMemoryRange memoryRange(m_allocation.memory.get(), 0ull, m_allocation.memory->getAllocationSize()); - if (!m_allocation.memory->getMemoryPropertyFlags().hasFlags(IDeviceMemoryAllocation::EMPF_HOST_COHERENT_BIT)) - base.m_device->invalidateMappedMemoryRanges(1, &memoryRange); - - assert(memoryRange.valid() && memoryRange.length >= sizeof(TestValues)); - - m_queue = m_base.m_device->getQueue(m_queueFamily, 0); - } - - ~EF64Submitter() - { - m_allocation.memory->unmap(); - } - - void setPushConstants(PushConstants& pc) - { - m_pushConstants = pc; - } - - TestValues submitGetGPUTestValues() - { - // record command buffer - m_cmdbuf->reset(IGPUCommandBuffer::RESET_FLAGS::NONE); - m_cmdbuf->begin(IGPUCommandBuffer::USAGE::NONE); - m_cmdbuf->beginDebugMarker("emulated_float64_t compute dispatch", vectorSIMDf(0, 1, 0, 1)); - m_cmdbuf->bindComputePipeline(m_pipeline.get()); - m_cmdbuf->bindDescriptorSets(nbl::asset::EPBP_COMPUTE, m_pplnLayout.get(), 0, 1, &m_ds.get()); - m_cmdbuf->pushConstants(m_pplnLayout.get(), IShader::E_SHADER_STAGE::ESS_COMPUTE, 0, sizeof(PushConstants), &m_pushConstants); - m_cmdbuf->dispatch(WORKGROUP_SIZE, 1, 1); - m_cmdbuf->endDebugMarker(); - m_cmdbuf->end(); - - IQueue::SSubmitInfo submitInfos[1] = {}; - const IQueue::SSubmitInfo::SCommandBufferInfo cmdbufs[] = { {.cmdbuf = m_cmdbuf.get()}}; - submitInfos[0].commandBuffers = cmdbufs; - const IQueue::SSubmitInfo::SSemaphoreInfo signals[] = { {.semaphore = m_semaphore.get(), .value = ++m_semaphoreCounter, .stageMask = asset::PIPELINE_STAGE_FLAGS::COMPUTE_SHADER_BIT}}; - submitInfos[0].signalSemaphores = signals; - - m_base.m_api->startCapture(); - m_queue->submit(submitInfos); - m_base.m_api->endCapture(); - - m_base.m_device->waitIdle(); - TestValues output; - std::memcpy(&output, static_cast*>(m_allocation.memory->getMappedPointer()), sizeof(TestValues)); - m_base.m_device->waitIdle(); - - return output; - } - - private: - uint32_t m_queueFamily; - nbl::video::IDeviceMemoryAllocator::SAllocation m_allocation = {}; - smart_refctd_ptr m_cmdbuf = nullptr; - smart_refctd_ptr m_cmdpool = nullptr; - smart_refctd_ptr m_ds = nullptr; - smart_refctd_ptr m_pplnLayout = nullptr; - PushConstants m_pushConstants; - CompatibilityTest& m_base; - smart_refctd_ptr m_pipeline; - smart_refctd_ptr m_semaphore; - IQueue* m_queue; - uint64_t m_semaphoreCounter; - }; - - void emulated_float64_tests() - { - EF64Submitter submitter(*this); - - auto printTestOutput = [this](const std::string& functionName, const EmulatedFloat64TestOutput& testResult) - { - std::cout << functionName << ": " << std::endl; - - if (!testResult.cpuTestsSucceed) - logFail("Incorrect CPU determinated values!"); - else - m_logger->log("Correct CPU determinated values!", ILogger::ELL_PERFORMANCE); - - if (!testResult.gpuTestsSucceed) - logFail("Incorrect GPU determinated values!"); - else - m_logger->log("Correct GPU determinated values!", ILogger::ELL_PERFORMANCE); - }; - - m_logFile.open("EmulatedFloatTestLog.txt", std::ios::out | std::ios::trunc); - if (!m_logFile.is_open()) - m_logger->log("Failed to open log file!", system::ILogger::ELL_ERROR); - - printTestOutput("emulatedFloat64RandomValuesTest", emulatedFloat64RandomValuesTest(submitter)); - printTestOutput("emulatedFloat64RandomValuesTestContrastingExponents", emulatedFloat64RandomValuesTestContrastingExponents(submitter)); - printTestOutput("emulatedFloat64NegAndPosZeroTest", emulatedFloat64NegAndPosZeroTest(submitter)); - printTestOutput("emulatedFloat64BothValuesInfTest", emulatedFloat64BothValuesInfTest(submitter)); - printTestOutput("emulatedFloat64BothValuesNegInfTest", emulatedFloat64BothValuesNegInfTest(submitter)); - printTestOutput("emulatedFloat64OneValIsInfOtherIsNegInfTest", emulatedFloat64OneValIsInfOtherIsNegInfTest(submitter)); - printTestOutput("emulatedFloat64OneValIsInfTest", emulatedFloat64OneValIsInfTest(submitter)); - printTestOutput("emulatedFloat64OneValIsNegInfTest", emulatedFloat64OneValIsNegInfTest(submitter)); - if(false) // doesn't work for some reason + fast math is enabled by default - printTestOutput("emulatedFloat64BNaNTest", emulatedFloat64BNaNTest(submitter)); - printTestOutput("emulatedFloat64BInfTest", emulatedFloat64OneValIsZeroTest(submitter)); - printTestOutput("emulatedFloat64BNegInfTest", emulatedFloat64OneValIsNegZeroTest(submitter)); - - m_logFile.close(); - } - - template - struct EmulatedFloat64TestValuesInfo - { - emulated_float64_t a; - emulated_float64_t b; - ConstructorTestValues constrTestValues; - TestValues expectedTestValues; - - void fillExpectedTestValues() - { - double aAsDouble = reinterpret_cast(a); - double bAsDouble = reinterpret_cast(b); - - expectedTestValues.a = a.data; - expectedTestValues.b = b.data; - - expectedTestValues.int32CreateVal = bit_cast(double(constrTestValues.int32)); - expectedTestValues.int64CreateVal = bit_cast(double(constrTestValues.int64)); - expectedTestValues.uint32CreateVal = bit_cast(double(constrTestValues.uint32)); - expectedTestValues.uint64CreateVal = bit_cast(double(constrTestValues.uint64)); - expectedTestValues.float32CreateVal = bit_cast(double(constrTestValues.float32)); - expectedTestValues.float64CreateVal = bit_cast(constrTestValues.float64); - expectedTestValues.additionVal = emulated_float64_t::create(aAsDouble + bAsDouble).data; - expectedTestValues.substractionVal = emulated_float64_t::create(aAsDouble - bAsDouble).data; - expectedTestValues.multiplicationVal = emulated_float64_t::create(aAsDouble * bAsDouble).data; - expectedTestValues.divisionVal = emulated_float64_t::create(aAsDouble / bAsDouble).data; - expectedTestValues.lessOrEqualVal = aAsDouble <= bAsDouble; - expectedTestValues.greaterOrEqualVal = aAsDouble >= bAsDouble; - expectedTestValues.equalVal = aAsDouble == bAsDouble; - expectedTestValues.notEqualVal = aAsDouble != bAsDouble; - expectedTestValues.lessVal = aAsDouble < bAsDouble; - expectedTestValues.greaterVal = aAsDouble > bAsDouble; - } - }; - - struct EmulatedFloat64TestOutput - { - bool cpuTestsSucceed; - bool gpuTestsSucceed; - }; - - EmulatedFloat64TestOutput emulatedFloat64LoopedTests_impl(EF64Submitter& submitter, - const uint32_t iterations, - const std::function& determineValueA, - const std::function& determineValueB) - { - EmulatedFloat64TestOutput output = { true, true }; - - std::uniform_int_distribution i32Distribution(-std::numeric_limits::max(), std::numeric_limits::max()); - std::uniform_int_distribution i64Distribution(-std::numeric_limits::max(), std::numeric_limits::max()); - std::uniform_int_distribution u32Distribution(-std::numeric_limits::max(), std::numeric_limits::max()); - std::uniform_int_distribution u64Distribution(-std::numeric_limits::max(), std::numeric_limits::max()); - std::uniform_real_distribution fDistribution(-100000.0, 100000.0); - - std::random_device rd; - std::mt19937 mt(rd()); - - for (uint32_t i = 0u; i < iterations; ++i) - { - // generate random test values - EmulatedFloat64TestValuesInfo testValInfo; - double aTmp = determineValueA(); - double bTmp = determineValueB(); - testValInfo.a.data = reinterpret_cast::storage_t&>(aTmp); - testValInfo.b.data = reinterpret_cast::storage_t&>(bTmp); - testValInfo.constrTestValues.int32 = i32Distribution(mt); - testValInfo.constrTestValues.int64 = i64Distribution(mt); - testValInfo.constrTestValues.uint32 = u32Distribution(mt); - testValInfo.constrTestValues.uint64 = u64Distribution(mt); - testValInfo.constrTestValues.float32 = fDistribution(mt); - testValInfo.constrTestValues.float64 = fDistribution(mt); - - testValInfo.fillExpectedTestValues(); - auto singleTestOutput = performEmulatedFloat64Tests(testValInfo, submitter); - - if (!singleTestOutput.cpuTestsSucceed) - output.cpuTestsSucceed = false; - if (!singleTestOutput.gpuTestsSucceed) - output.gpuTestsSucceed = false; - } - - return output; - } - - EmulatedFloat64TestOutput emulatedFloat64RandomValuesTest(EF64Submitter& submitter) - { - auto getRandomFloat64 = []() - { - static std::random_device rd; - static std::mt19937 mt(rd()); - static std::uniform_real_distribution distribution(-100000.0, 100000.0); - - - return distribution(mt); - }; - - return emulatedFloat64LoopedTests_impl(submitter, EmulatedFloat64TestIterations, getRandomFloat64, getRandomFloat64); - } - - EmulatedFloat64TestOutput emulatedFloat64RandomValuesTestContrastingExponents(EF64Submitter& submitter) - { - auto getRandomSmallFloat64 = []() - { - static std::random_device rd; - static std::mt19937 mt(rd()); - static std::uniform_real_distribution distribution(-0.01, 0.01); - - return distribution(mt); - }; - - auto getRandomLargeFloat64 = []() - { - static std::random_device rd; - static std::mt19937 mt(rd()); - static std::uniform_real_distribution distribution(1000000000.0, 2000000000.0); - static std::uniform_int_distribution coinFlipDistribution(0, 1); - - double output = distribution(mt); - if (coinFlipDistribution(mt)) - output = -output; - - return output; - }; - - EmulatedFloat64TestOutput firstTestOutput = emulatedFloat64LoopedTests_impl(submitter, EmulatedFloat64TestIterations / 2, getRandomSmallFloat64, getRandomLargeFloat64); - EmulatedFloat64TestOutput secondTestOutput = emulatedFloat64LoopedTests_impl(submitter, EmulatedFloat64TestIterations / 2, getRandomLargeFloat64, getRandomSmallFloat64); - - EmulatedFloat64TestOutput output; - output.cpuTestsSucceed = firstTestOutput.cpuTestsSucceed && secondTestOutput.cpuTestsSucceed; - output.gpuTestsSucceed = firstTestOutput.gpuTestsSucceed && secondTestOutput.gpuTestsSucceed; - return output; - } - - EmulatedFloat64TestOutput emulatedFloat64BothValuesNaNTest(EF64Submitter& submitter) - { - smart_refctd_ptr semaphore = m_device->createSemaphore(0); - - EmulatedFloat64TestValuesInfo testValInfo; - const float32_t nan32 = std::numeric_limits::quiet_NaN(); - const float64_t nan64 = std::numeric_limits::quiet_NaN(); - testValInfo.a = emulated_float64_t::create(nan64); - testValInfo.b = emulated_float64_t::create(nan64); - testValInfo.constrTestValues = { - .int32 = std::bit_cast(nan32), - .int64 = std::bit_cast(nan64), - .uint32 = std::bit_cast(nan32), - .uint64 = std::bit_cast(nan64), - .float32 = nan32 - //.float64 = nan64 - }; - - testValInfo.fillExpectedTestValues(); - return performEmulatedFloat64Tests(testValInfo, submitter); - } - - EmulatedFloat64TestOutput emulatedFloat64NegAndPosZeroTest(EF64Submitter& submitter) - { - smart_refctd_ptr semaphore = m_device->createSemaphore(0); - - EmulatedFloat64TestValuesInfo testValInfo; - testValInfo.a = emulated_float64_t::create(ieee754::traits::signMask); - testValInfo.b = emulated_float64_t::create(std::bit_cast(0.0)); - testValInfo.constrTestValues = { - .int32 = 0, - .int64 = 0, - .uint32 = 0, - .uint64 = 0, - .float32 = 0 - }; - - testValInfo.fillExpectedTestValues(); - auto firstTestOutput = performEmulatedFloat64Tests(testValInfo, submitter); - std::swap(testValInfo.a, testValInfo.b); - testValInfo.fillExpectedTestValues(); - auto secondTestOutput = performEmulatedFloat64Tests(testValInfo, submitter); - - return { firstTestOutput.cpuTestsSucceed && secondTestOutput.cpuTestsSucceed, firstTestOutput.gpuTestsSucceed && secondTestOutput.gpuTestsSucceed }; - } - - EmulatedFloat64TestOutput emulatedFloat64BothValuesInfTest(EF64Submitter& submitter) - { - smart_refctd_ptr semaphore = m_device->createSemaphore(0); - - EmulatedFloat64TestValuesInfo testValInfo; - const float32_t inf32 = std::numeric_limits::infinity(); - const float64_t inf64 = std::numeric_limits::infinity(); - testValInfo.a = emulated_float64_t::create(inf64); - testValInfo.b = emulated_float64_t::create(inf64); - testValInfo.constrTestValues = { - .int32 = 0, - .int64 = 0, - .uint32 = 0, - .uint64 = 0, - .float32 = inf32 - //.float64 = inf64 - }; - - testValInfo.fillExpectedTestValues(); - return performEmulatedFloat64Tests(testValInfo, submitter); - } - - EmulatedFloat64TestOutput emulatedFloat64BothValuesNegInfTest(EF64Submitter& submitter) - { - smart_refctd_ptr semaphore = m_device->createSemaphore(0); - - EmulatedFloat64TestValuesInfo testValInfo; - const float32_t inf32 = -std::numeric_limits::infinity(); - const float64_t inf64 = -std::numeric_limits::infinity(); - testValInfo.a = emulated_float64_t::create(inf64); - testValInfo.b = emulated_float64_t::create(inf64); - testValInfo.constrTestValues = { - .int32 = 0, - .int64 = 0, - .uint32 = 0, - .uint64 = 0, - .float32 = inf32 - //.float64 = inf64 - }; - - testValInfo.fillExpectedTestValues(); - return performEmulatedFloat64Tests(testValInfo, submitter); - } - - EmulatedFloat64TestOutput emulatedFloat64OneValIsInfOtherIsNegInfTest(EF64Submitter& submitter) - { - smart_refctd_ptr semaphore = m_device->createSemaphore(0); - - EmulatedFloat64TestValuesInfo testValInfo; - const float64_t inf64 = -std::numeric_limits::infinity(); - testValInfo.a = emulated_float64_t::create(inf64); - testValInfo.b = emulated_float64_t::create(inf64); - testValInfo.constrTestValues = { - .int32 = 0, - .int64 = 0, - .uint32 = 0, - .uint64 = 0, - .float32 = 0 - //.float64 = inf64 - }; - - testValInfo.fillExpectedTestValues(); - auto firstTestOutput = performEmulatedFloat64Tests(testValInfo, submitter); - std::swap(testValInfo.a, testValInfo.b); - testValInfo.fillExpectedTestValues(); - auto secondTestOutput = performEmulatedFloat64Tests(testValInfo, submitter); - - return { firstTestOutput.cpuTestsSucceed && secondTestOutput.cpuTestsSucceed, firstTestOutput.gpuTestsSucceed && secondTestOutput.gpuTestsSucceed }; - } - - // TODO: fix - EmulatedFloat64TestOutput emulatedFloat64BNaNTest(EF64Submitter& submitter) - { - EmulatedFloat64TestOutput output = { true, true }; - smart_refctd_ptr semaphore = m_device->createSemaphore(0); - - for (uint32_t i = 0u; i < EmulatedFloat64TestIterations; ++i) - { - std::random_device rd; - std::mt19937 mt(rd()); - - std::uniform_int_distribution i32Distribution(-std::numeric_limits::max(), std::numeric_limits::max()); - std::uniform_int_distribution i64Distribution(-std::numeric_limits::max(), std::numeric_limits::max()); - std::uniform_int_distribution u32Distribution(-std::numeric_limits::max(), std::numeric_limits::max()); - std::uniform_int_distribution u64Distribution(-std::numeric_limits::max(), std::numeric_limits::max()); - std::uniform_real_distribution f32Distribution(-100000.0f, 100000.0f); - std::uniform_real_distribution f64Distribution(-100000.0, 100000.0); - - EmulatedFloat64TestValuesInfo testValInfo; - double aTmp = f64Distribution(mt); - double bTmp = std::numeric_limits::quiet_NaN(); - testValInfo.a.data = reinterpret_cast::storage_t&>(aTmp); - testValInfo.b.data = reinterpret_cast::storage_t&>(bTmp); - testValInfo.constrTestValues.int32 = i32Distribution(mt); - testValInfo.constrTestValues.int64 = i64Distribution(mt); - testValInfo.constrTestValues.uint32 = u32Distribution(mt); - testValInfo.constrTestValues.uint64 = u64Distribution(mt); - testValInfo.constrTestValues.float32 = f32Distribution(mt); - //testValInfo.constrTestValues.float64 = f64Distribution(mt); - - testValInfo.fillExpectedTestValues(); - auto singleTestOutput = performEmulatedFloat64Tests(testValInfo, submitter); - - if (!singleTestOutput.cpuTestsSucceed) - output.cpuTestsSucceed = false; - if (!singleTestOutput.gpuTestsSucceed) - output.gpuTestsSucceed = false; - } - - return output; - } - - EmulatedFloat64TestOutput emulatedFloat64OneValIsInfTest(EF64Submitter& submitter) - { - auto getRandomFloat64 = []() - { - static std::random_device rd; - static std::mt19937 mt(rd()); - static std::uniform_real_distribution distribution(-100000.0, 100000.0); - - return distribution(mt); - }; - - auto getInfinity = []() - { - return std::numeric_limits::infinity(); - }; - - EmulatedFloat64TestOutput firstTestOutput = emulatedFloat64LoopedTests_impl(submitter, EmulatedFloat64TestIterations / 2, getRandomFloat64, getInfinity); - EmulatedFloat64TestOutput secondTestOutput = emulatedFloat64LoopedTests_impl(submitter, EmulatedFloat64TestIterations / 2, getInfinity, getRandomFloat64); - - EmulatedFloat64TestOutput output; - output.cpuTestsSucceed = firstTestOutput.cpuTestsSucceed && secondTestOutput.cpuTestsSucceed; - output.gpuTestsSucceed = firstTestOutput.gpuTestsSucceed && secondTestOutput.gpuTestsSucceed; - return output; - } + if (!shader) + base.logFail("Failed to load precompiled \"test\" shader!\n"); + + nbl::video::IGPUDescriptorSetLayout::SBinding bindings[1] = { + {.binding = 0, + .type = nbl::asset::IDescriptor::E_TYPE::ET_STORAGE_BUFFER, + .createFlags = IGPUDescriptorSetLayout::SBinding::E_CREATE_FLAGS::ECF_NONE, + .stageFlags = ShaderStage::ESS_COMPUTE, + .count = 1}}; + smart_refctd_ptr dsLayout = base.m_device->createDescriptorSetLayout(bindings); + if (!dsLayout) + base.logFail("Failed to create a Descriptor Layout!\n"); + + SPushConstantRange pushConstantRanges[] = { + {.stageFlags = ShaderStage::ESS_COMPUTE, + .offset = 0, + .size = sizeof(PushConstants)}}; + m_pplnLayout = base.m_device->createPipelineLayout(pushConstantRanges, smart_refctd_ptr(dsLayout)); + if (!m_pplnLayout) + base.logFail("Failed to create a Pipeline Layout!\n"); - EmulatedFloat64TestOutput emulatedFloat64OneValIsNegInfTest(EF64Submitter& submitter) - { - auto getRandomFloat64 = []() { - static std::random_device rd; - static std::mt19937 mt(rd()); - static std::uniform_real_distribution distribution(-100000.0, 100000.0); - - - return distribution(mt); - }; - - auto getNegInfinity = []() - { - return -std::numeric_limits::infinity(); - }; - - EmulatedFloat64TestOutput firstTestOutput = emulatedFloat64LoopedTests_impl(submitter, EmulatedFloat64TestIterations / 2, getRandomFloat64, getNegInfinity); - EmulatedFloat64TestOutput secondTestOutput = emulatedFloat64LoopedTests_impl(submitter, EmulatedFloat64TestIterations / 2, getNegInfinity, getRandomFloat64); - - EmulatedFloat64TestOutput output; - output.cpuTestsSucceed = firstTestOutput.cpuTestsSucceed && secondTestOutput.cpuTestsSucceed; - output.gpuTestsSucceed = firstTestOutput.gpuTestsSucceed && secondTestOutput.gpuTestsSucceed; - return output; - } - - EmulatedFloat64TestOutput emulatedFloat64OneValIsZeroTest(EF64Submitter& submitter) - { - auto getRandomFloat64 = []() - { - static std::random_device rd; - static std::mt19937 mt(rd()); - static std::uniform_real_distribution distribution(-100000.0, 100000.0); - - return distribution(mt); - }; - - auto getZero = []() - { - return 0.0; - }; - - EmulatedFloat64TestOutput firstTestOutput = emulatedFloat64LoopedTests_impl(submitter, EmulatedFloat64TestIterations / 2, getRandomFloat64, getZero); - EmulatedFloat64TestOutput secondTestOutput = emulatedFloat64LoopedTests_impl(submitter, EmulatedFloat64TestIterations / 2, getZero, getRandomFloat64); - - EmulatedFloat64TestOutput output; - output.cpuTestsSucceed = firstTestOutput.cpuTestsSucceed && secondTestOutput.cpuTestsSucceed; - output.gpuTestsSucceed = firstTestOutput.gpuTestsSucceed && secondTestOutput.gpuTestsSucceed; - return output; - } - - EmulatedFloat64TestOutput emulatedFloat64OneValIsNegZeroTest(EF64Submitter& submitter) - { - auto getRandomFloat64 = []() - { - static std::random_device rd; - static std::mt19937 mt(rd()); - static std::uniform_real_distribution distribution(-100000.0, 100000.0); - - return distribution(mt); - }; + IGPUComputePipeline::SCreationParams params = {}; + params.layout = m_pplnLayout.get(); + params.shader.entryPoint = "main"; + params.shader.shader = shader.get(); + if (base.m_device->getEnabledFeatures().pipelineExecutableInfo) + { + params.flags |= IGPUComputePipeline::SCreationParams::FLAGS::CAPTURE_STATISTICS; + params.flags |= IGPUComputePipeline::SCreationParams::FLAGS::CAPTURE_INTERNAL_REPRESENTATIONS; + } + if (!base.m_device->createComputePipelines(nullptr, {¶ms, 1}, &m_pipeline)) + base.logFail("Failed to create pipelines (compile & link shaders)!\n"); + + if (base.m_device->getEnabledFeatures().pipelineExecutableInfo) + { + auto report = system::to_string(m_pipeline->getExecutableInfo()); + base.m_logger->log("EF64Submitter Pipeline Executable Report:\n%s", ILogger::ELL_PERFORMANCE, report.c_str()); + } + } - auto getNegZero = []() - { - return -0.0; - }; - - EmulatedFloat64TestOutput firstTestOutput = emulatedFloat64LoopedTests_impl(submitter, EmulatedFloat64TestIterations / 2, getRandomFloat64, getNegZero); - EmulatedFloat64TestOutput secondTestOutput = emulatedFloat64LoopedTests_impl(submitter, EmulatedFloat64TestIterations / 2, getNegZero, getRandomFloat64); - - EmulatedFloat64TestOutput output; - output.cpuTestsSucceed = firstTestOutput.cpuTestsSucceed && secondTestOutput.cpuTestsSucceed; - output.gpuTestsSucceed = firstTestOutput.gpuTestsSucceed && secondTestOutput.gpuTestsSucceed; - return output; - } - - template - EmulatedFloat64TestOutput performEmulatedFloat64Tests(EmulatedFloat64TestValuesInfo& testValInfo, EF64Submitter& submitter) - { - emulated_float64_t a = testValInfo.a; - emulated_float64_t b = testValInfo.b; - - const TestValues cpuTestValues = { - .int32CreateVal = emulated_float64_t::create(testValInfo.constrTestValues.int32).data, - .int64CreateVal = emulated_float64_t::create(testValInfo.constrTestValues.int64).data, - .uint32CreateVal = emulated_float64_t::create(testValInfo.constrTestValues.uint32).data, - .uint64CreateVal = emulated_float64_t::create(testValInfo.constrTestValues.uint64).data, - .float32CreateVal = emulated_float64_t::create(testValInfo.constrTestValues.float32).data, - .float64CreateVal = emulated_float64_t::create(testValInfo.constrTestValues.float64).data, - .additionVal = (a + b).data, - .substractionVal = (a - b).data, - .multiplicationVal = (a * b).data, - .divisionVal = (a / b).data, - .lessOrEqualVal = a <= b, - .greaterOrEqualVal = a >= b, - .equalVal = a == b, - .notEqualVal = a != b, - .lessVal = a < b, - .greaterVal = a > b - }; - - EmulatedFloat64TestOutput output; - - // cpu validation - output.cpuTestsSucceed = compareEmulatedFloat64TestValues(testValInfo.expectedTestValues, cpuTestValues); - - // gpu validation - PushConstants pc; - pc.a = reinterpret_cast(a); - pc.b = reinterpret_cast(b); - pc.constrTestVals = testValInfo.constrTestValues; - - submitter.setPushConstants(pc); - auto gpuTestValues = submitter.submitGetGPUTestValues(); - - output.gpuTestsSucceed = compareEmulatedFloat64TestValues(testValInfo.expectedTestValues, gpuTestValues); - - return output; - } - - class EF64Benchmark final - { - public: - EF64Benchmark(CompatibilityTest& base) - { - m_device = base.m_device; - m_logger = base.m_logger; - m_api = base.m_api; - - // setting up pipeline in the constructor - m_queueFamily = base.getComputeQueue()->getFamilyIndex(); - m_cmdpool = base.m_device->createCommandPool(m_queueFamily, IGPUCommandPool::CREATE_FLAGS::RESET_COMMAND_BUFFER_BIT); - if (!m_cmdpool->createCommandBuffers(IGPUCommandPool::BUFFER_LEVEL::PRIMARY, 1u, &m_cmdbuf)) - base.logFail("Failed to create Command Buffers!\n"); - - // Load shaders, set up pipeline + // Allocate the memory { - smart_refctd_ptr shader; - { - IAssetLoader::SAssetLoadParams lp = {}; - lp.logger = base.m_logger.get(); - lp.workingDirectory = "app_resources"; // virtual root - // this time we load a shader directly from a file - auto key = nbl::this_example::builtin::build::get_spirv_key<"benchmark">(m_device.get()); - auto assetBundle = base.m_assetMgr->getAsset(key.data(), lp); - const auto assets = assetBundle.getContents(); - if (assets.empty()) - { - base.logFail("Could not load shader!"); - assert(0); - } - - // It would be super weird if loading a shader from a file produced more than 1 asset - assert(assets.size() == 1); - shader = IAsset::castDown(assets[0]); - } - - if (!shader) - base.logFail("Failed to load precompiled \"benchmark\" shader!\n"); - - nbl::video::IGPUDescriptorSetLayout::SBinding bindings[1] = { - { - .binding = 0, - .type = nbl::asset::IDescriptor::E_TYPE::ET_STORAGE_BUFFER, - .createFlags = IGPUDescriptorSetLayout::SBinding::E_CREATE_FLAGS::ECF_NONE, - .stageFlags = ShaderStage::ESS_COMPUTE, - .count = 1 - } - }; - smart_refctd_ptr dsLayout = base.m_device->createDescriptorSetLayout(bindings); - if (!dsLayout) - base.logFail("Failed to create a Descriptor Layout!\n"); - - SPushConstantRange pushConstantRanges[] = { - { - .stageFlags = ShaderStage::ESS_COMPUTE, - .offset = 0, - .size = sizeof(BenchmarkPushConstants) - } - }; - m_pplnLayout = base.m_device->createPipelineLayout(pushConstantRanges, smart_refctd_ptr(dsLayout)); - if (!m_pplnLayout) - base.logFail("Failed to create a Pipeline Layout!\n"); - - { - IGPUComputePipeline::SCreationParams params = {}; - params.layout = m_pplnLayout.get(); - params.shader.entryPoint = "main"; - params.shader.shader = shader.get(); - if (base.m_device->getEnabledFeatures().pipelineExecutableInfo) - { - params.flags |= IGPUComputePipeline::SCreationParams::FLAGS::CAPTURE_STATISTICS; - params.flags |= IGPUComputePipeline::SCreationParams::FLAGS::CAPTURE_INTERNAL_REPRESENTATIONS; - } - if (!base.m_device->createComputePipelines(nullptr, { ¶ms,1 }, &m_pipeline)) - base.logFail("Failed to create pipelines (compile & link shaders)!\n"); - - if (base.m_device->getEnabledFeatures().pipelineExecutableInfo) - { - auto report = system::to_string(m_pipeline->getExecutableInfo()); - base.m_logger->log("EF64Benchmark Pipeline Executable Report:\n%s", ILogger::ELL_PERFORMANCE, report.c_str()); - } - } - - // Allocate the memory - { - static_assert(sizeof(float64_t) == sizeof(benchmark_emulated_float64_t)); - constexpr size_t BufferSize = BENCHMARK_WORKGROUP_COUNT * BENCHMARK_WORKGROUP_DIMENSION_SIZE_X * - BENCHMARK_WORKGROUP_DIMENSION_SIZE_Y * BENCHMARK_WORKGROUP_DIMENSION_SIZE_Z * sizeof(float64_t); - - nbl::video::IGPUBuffer::SCreationParams params = {}; - params.size = BufferSize; - params.usage = IGPUBuffer::EUF_STORAGE_BUFFER_BIT; - smart_refctd_ptr dummyBuff = base.m_device->createBuffer(std::move(params)); - if (!dummyBuff) - base.logFail("Failed to create a GPU Buffer of size %d!\n", params.size); - - dummyBuff->setObjectDebugName("benchmark buffer"); - - nbl::video::IDeviceMemoryBacked::SDeviceMemoryRequirements reqs = dummyBuff->getMemoryReqs(); - reqs.memoryTypeBits &= base.m_physicalDevice->getDeviceLocalMemoryTypeBits(); - - m_allocation = base.m_device->allocate(reqs, dummyBuff.get(), nbl::video::IDeviceMemoryAllocation::EMAF_NONE); - if (!m_allocation.isValid()) - base.logFail("Failed to allocate Device Memory compatible with our GPU Buffer!\n"); - - assert(dummyBuff->getBoundMemory().memory == m_allocation.memory.get()); - smart_refctd_ptr pool = base.m_device->createDescriptorPoolForDSLayouts(IDescriptorPool::ECF_NONE, { &dsLayout.get(),1 }); - - m_ds = pool->createDescriptorSet(std::move(dsLayout)); - { - IGPUDescriptorSet::SDescriptorInfo info[1]; - info[0].desc = smart_refctd_ptr(dummyBuff); - info[0].info.buffer = { .offset = 0,.size = BufferSize }; - IGPUDescriptorSet::SWriteDescriptorSet writes[1] = { - {.dstSet = m_ds.get(),.binding = 0,.arrayElement = 0,.count = 1,.info = info} - }; - base.m_device->updateDescriptorSets(writes, {}); - } - } + constexpr size_t BufferSize = sizeof(TestValues); + + nbl::video::IGPUBuffer::SCreationParams params = {}; + params.size = BufferSize; + params.usage = IGPUBuffer::EUF_STORAGE_BUFFER_BIT; + smart_refctd_ptr outputBuff = base.m_device->createBuffer(std::move(params)); + if (!outputBuff) + base.logFail("Failed to create a GPU Buffer of size %d!\n", params.size); + + outputBuff->setObjectDebugName("emulated_float64_t output buffer"); + + nbl::video::IDeviceMemoryBacked::SDeviceMemoryRequirements reqs = outputBuff->getMemoryReqs(); + reqs.memoryTypeBits &= base.m_physicalDevice->getHostVisibleMemoryTypeBits(); + + m_allocation = base.m_device->allocate(reqs, outputBuff.get(), nbl::video::IDeviceMemoryAllocation::EMAF_NONE); + if (!m_allocation.isValid()) + base.logFail("Failed to allocate Device Memory compatible with our GPU Buffer!\n"); + + assert(outputBuff->getBoundMemory().memory == m_allocation.memory.get()); + smart_refctd_ptr pool = base.m_device->createDescriptorPoolForDSLayouts(IDescriptorPool::ECF_NONE, {&dsLayout.get(), 1}); + + m_ds = pool->createDescriptorSet(std::move(dsLayout)); + { + IGPUDescriptorSet::SDescriptorInfo info[1]; + info[0].desc = smart_refctd_ptr(outputBuff); + info[0].info.buffer = {.offset = 0, .size = BufferSize}; + IGPUDescriptorSet::SWriteDescriptorSet writes[1] = { + {.dstSet = m_ds.get(), .binding = 0, .arrayElement = 0, .count = 1, .info = info}}; + base.m_device->updateDescriptorSets(writes, {}); + } } - IQueryPool::SCreationParams queryPoolCreationParams{}; - queryPoolCreationParams.queryType = IQueryPool::TYPE::TIMESTAMP; - queryPoolCreationParams.queryCount = 2; - queryPoolCreationParams.pipelineStatisticsFlags = IQueryPool::PIPELINE_STATISTICS_FLAGS::NONE; - m_queryPool = m_device->createQueryPool(queryPoolCreationParams); - - m_computeQueue = m_device->getQueue(m_queueFamily, 0); - } - - void run() - { - m_logger->log("\n\nfloat64_t benchmark result:", ILogger::ELL_PERFORMANCE); - performBenchmark(EF64_BENCHMARK_MODE::NATIVE); - m_logger->log("emulated_float64_t benchmark, fast math enabled result:", ILogger::ELL_PERFORMANCE); - performBenchmark(EF64_BENCHMARK_MODE::EF64_FAST_MATH_ENABLED); - m_logger->log("emulated_float64_t benchmark, fast math disabled result:", ILogger::ELL_PERFORMANCE); - performBenchmark(EF64_BENCHMARK_MODE::EF64_FAST_MATH_DISABLED); - // every subgroup with even ID do calculations with the `emulated_float64_t` type, other subgroups do calculations with float64_t - m_logger->log("emulated_float64_t benchmark, subgroup divided work result:", ILogger::ELL_PERFORMANCE); - performBenchmark(EF64_BENCHMARK_MODE::SUBGROUP_DIVIDED_WORK); - // every item does calculations with both emulated and native types - m_logger->log("emulated_float64_t benchmark, interleaved result:", ILogger::ELL_PERFORMANCE); - performBenchmark(EF64_BENCHMARK_MODE::INTERLEAVED); - } - - private: - void performBenchmark(EF64_BENCHMARK_MODE mode) - { - m_device->waitIdle(); - - m_pushConstants.benchmarkMode = mode; - - // [warmup dispatches][ts 0][bench dispatches][ts 1][cooldown dispatches] in one cmdbuf, - // one submit. Per-submit semaphore chaining adds sync cost and blocks driver pipelining; - // the cooldown keeps the GPU in steady state across ts 1 so the trailing bench - // dispatches don't land in a winding-down tail. - constexpr int CooldownIterations = WarmupIterations; - - m_cmdbuf->reset(IGPUCommandBuffer::RESET_FLAGS::NONE); - m_cmdbuf->begin(IGPUCommandBuffer::USAGE::ONE_TIME_SUBMIT_BIT); - m_cmdbuf->beginDebugMarker("emulated_float64_t compute dispatch", vectorSIMDf(0, 1, 0, 1)); - m_cmdbuf->resetQueryPool(m_queryPool.get(), 0, 2); - m_cmdbuf->bindComputePipeline(m_pipeline.get()); - m_cmdbuf->bindDescriptorSets(nbl::asset::EPBP_COMPUTE, m_pplnLayout.get(), 0, 1, &m_ds.get()); - m_cmdbuf->pushConstants(m_pplnLayout.get(), IShader::E_SHADER_STAGE::ESS_COMPUTE, 0, sizeof(BenchmarkPushConstants), &m_pushConstants); - for (int i = 0; i < WarmupIterations; ++i) - m_cmdbuf->dispatch(BENCHMARK_WORKGROUP_COUNT, 1, 1); - m_cmdbuf->writeTimestamp(PIPELINE_STAGE_FLAGS::COMPUTE_SHADER_BIT, m_queryPool.get(), 0); - for (int i = 0; i < Iterations; ++i) - m_cmdbuf->dispatch(BENCHMARK_WORKGROUP_COUNT, 1, 1); - m_cmdbuf->writeTimestamp(PIPELINE_STAGE_FLAGS::COMPUTE_SHADER_BIT, m_queryPool.get(), 1); - for (int i = 0; i < CooldownIterations; ++i) - m_cmdbuf->dispatch(BENCHMARK_WORKGROUP_COUNT, 1, 1); - m_cmdbuf->endDebugMarker(); - m_cmdbuf->end(); - - smart_refctd_ptr semaphore = m_device->createSemaphore(0u); - const IQueue::SSubmitInfo::SCommandBufferInfo cmdbufs[] = { {.cmdbuf = m_cmdbuf.get()} }; - const IQueue::SSubmitInfo::SSemaphoreInfo signalSem[] = { - {.semaphore = semaphore.get(), .value = 1u, .stageMask = asset::PIPELINE_STAGE_FLAGS::COMPUTE_SHADER_BIT} - }; - IQueue::SSubmitInfo submit = {}; - submit.commandBuffers = cmdbufs; - submit.signalSemaphores = signalSem; - - m_api->startCapture(); - m_computeQueue->submit({&submit, 1u}); - m_api->endCapture(); - - m_device->waitIdle(); - - const uint64_t nativeBenchmarkTimeElapsedNanoseconds = calcTimeElapsed(); - const float nativeBenchmarkTimeElapsedSeconds = double(nativeBenchmarkTimeElapsedNanoseconds) / 1000000000.0; - - m_logger->log("%llu ns, %f s", ILogger::ELL_PERFORMANCE, nativeBenchmarkTimeElapsedNanoseconds, nativeBenchmarkTimeElapsedSeconds); - } - - uint64_t calcTimeElapsed() - { - uint64_t timestamps[2]; - const core::bitflag flags = core::bitflag(IQueryPool::RESULTS_FLAGS::_64_BIT) | core::bitflag(IQueryPool::RESULTS_FLAGS::WAIT_BIT); - m_device->getQueryPoolResults(m_queryPool.get(), 0, 2, ×tamps, sizeof(uint64_t), flags); - return timestamps[1] - timestamps[0]; - } - - private: - core::smart_refctd_ptr m_api; - smart_refctd_ptr m_device; - smart_refctd_ptr m_logger; - - nbl::video::IDeviceMemoryAllocator::SAllocation m_allocation = {}; - smart_refctd_ptr m_cmdpool = nullptr; - smart_refctd_ptr m_cmdbuf = nullptr; - smart_refctd_ptr m_ds = nullptr; - smart_refctd_ptr m_pplnLayout = nullptr; - BenchmarkPushConstants m_pushConstants; - smart_refctd_ptr m_pipeline; - - smart_refctd_ptr m_queryPool = nullptr; - - uint32_t m_queueFamily; - IQueue* m_computeQueue; - static constexpr int WarmupIterations = 1000; - static constexpr int Iterations = 1000; - using benchmark_emulated_float64_t = emulated_float64_t; - }; - - template - inline bool logFail(const char* msg, Args&&... args) - { - m_logger->log(msg, ILogger::ELL_ERROR, std::forward(args)...); - return false; - } - - std::ofstream m_logFile; + if (!m_allocation.memory->map({0ull, m_allocation.memory->getAllocationSize()}, IDeviceMemoryAllocation::EMCAF_READ)) + base.logFail("Failed to map the Device Memory!\n"); + } + + // if the mapping is not coherent the range needs to be invalidated to pull in new data for the CPU's caches + const ILogicalDevice::MappedMemoryRange memoryRange(m_allocation.memory.get(), 0ull, m_allocation.memory->getAllocationSize()); + if (!m_allocation.memory->getMemoryPropertyFlags().hasFlags(IDeviceMemoryAllocation::EMPF_HOST_COHERENT_BIT)) + base.m_device->invalidateMappedMemoryRanges(1, &memoryRange); + + assert(memoryRange.valid() && memoryRange.length >= sizeof(TestValues)); + + m_queue = m_base.m_device->getQueue(m_queueFamily, 0); + } + + ~EF64Submitter() + { + m_allocation.memory->unmap(); + } + + void setPushConstants(PushConstants& pc) + { + m_pushConstants = pc; + } + + TestValues submitGetGPUTestValues() + { + // record command buffer + m_cmdbuf->reset(IGPUCommandBuffer::RESET_FLAGS::NONE); + m_cmdbuf->begin(IGPUCommandBuffer::USAGE::NONE); + m_cmdbuf->beginDebugMarker("emulated_float64_t compute dispatch", vectorSIMDf(0, 1, 0, 1)); + m_cmdbuf->bindComputePipeline(m_pipeline.get()); + m_cmdbuf->bindDescriptorSets(nbl::asset::EPBP_COMPUTE, m_pplnLayout.get(), 0, 1, &m_ds.get()); + m_cmdbuf->pushConstants(m_pplnLayout.get(), IShader::E_SHADER_STAGE::ESS_COMPUTE, 0, sizeof(PushConstants), &m_pushConstants); + m_cmdbuf->dispatch(WORKGROUP_SIZE, 1, 1); + m_cmdbuf->endDebugMarker(); + m_cmdbuf->end(); + + IQueue::SSubmitInfo submitInfos[1] = {}; + const IQueue::SSubmitInfo::SCommandBufferInfo cmdbufs[] = {{.cmdbuf = m_cmdbuf.get()}}; + submitInfos[0].commandBuffers = cmdbufs; + const IQueue::SSubmitInfo::SSemaphoreInfo signals[] = {{.semaphore = m_semaphore.get(), .value = ++m_semaphoreCounter, .stageMask = asset::PIPELINE_STAGE_FLAGS::COMPUTE_SHADER_BIT}}; + submitInfos[0].signalSemaphores = signals; + + m_base.m_api->startCapture(); + m_queue->submit(submitInfos); + m_base.m_api->endCapture(); + + m_base.m_device->waitIdle(); + TestValues output; + std::memcpy(&output, static_cast*>(m_allocation.memory->getMappedPointer()), sizeof(TestValues)); + m_base.m_device->waitIdle(); + + return output; + } + + private: + uint32_t m_queueFamily; + nbl::video::IDeviceMemoryAllocator::SAllocation m_allocation = {}; + smart_refctd_ptr m_cmdbuf = nullptr; + smart_refctd_ptr m_cmdpool = nullptr; + smart_refctd_ptr m_ds = nullptr; + smart_refctd_ptr m_pplnLayout = nullptr; + PushConstants m_pushConstants; + CompatibilityTest& m_base; + smart_refctd_ptr m_pipeline; + smart_refctd_ptr m_semaphore; + IQueue* m_queue; + uint64_t m_semaphoreCounter; + }; + + void emulated_float64_tests() + { + EF64Submitter submitter(*this); + + auto printTestOutput = [this](const std::string& functionName, const EmulatedFloat64TestOutput& testResult) + { + std::cout << functionName << ": " << std::endl; + + if (!testResult.cpuTestsSucceed) + logFail("Incorrect CPU determinated values!"); + else + m_logger->log("Correct CPU determinated values!", ILogger::ELL_PERFORMANCE); + + if (!testResult.gpuTestsSucceed) + logFail("Incorrect GPU determinated values!"); + else + m_logger->log("Correct GPU determinated values!", ILogger::ELL_PERFORMANCE); + }; + + m_logFile.open("EmulatedFloatTestLog.txt", std::ios::out | std::ios::trunc); + if (!m_logFile.is_open()) + m_logger->log("Failed to open log file!", system::ILogger::ELL_ERROR); + + printTestOutput("emulatedFloat64RandomValuesTest", emulatedFloat64RandomValuesTest(submitter)); + printTestOutput("emulatedFloat64RandomValuesTestContrastingExponents", emulatedFloat64RandomValuesTestContrastingExponents(submitter)); + printTestOutput("emulatedFloat64NegAndPosZeroTest", emulatedFloat64NegAndPosZeroTest(submitter)); + printTestOutput("emulatedFloat64BothValuesInfTest", emulatedFloat64BothValuesInfTest(submitter)); + printTestOutput("emulatedFloat64BothValuesNegInfTest", emulatedFloat64BothValuesNegInfTest(submitter)); + printTestOutput("emulatedFloat64OneValIsInfOtherIsNegInfTest", emulatedFloat64OneValIsInfOtherIsNegInfTest(submitter)); + printTestOutput("emulatedFloat64OneValIsInfTest", emulatedFloat64OneValIsInfTest(submitter)); + printTestOutput("emulatedFloat64OneValIsNegInfTest", emulatedFloat64OneValIsNegInfTest(submitter)); + if (false) // doesn't work for some reason + fast math is enabled by default + printTestOutput("emulatedFloat64BNaNTest", emulatedFloat64BNaNTest(submitter)); + printTestOutput("emulatedFloat64BInfTest", emulatedFloat64OneValIsZeroTest(submitter)); + printTestOutput("emulatedFloat64BNegInfTest", emulatedFloat64OneValIsNegZeroTest(submitter)); + + m_logFile.close(); + } + + template + struct EmulatedFloat64TestValuesInfo + { + emulated_float64_t a; + emulated_float64_t b; + ConstructorTestValues constrTestValues; + TestValues expectedTestValues; + + void fillExpectedTestValues() + { + double aAsDouble = reinterpret_cast(a); + double bAsDouble = reinterpret_cast(b); + + expectedTestValues.a = a.data; + expectedTestValues.b = b.data; + + expectedTestValues.int32CreateVal = bit_cast(double(constrTestValues.int32)); + expectedTestValues.int64CreateVal = bit_cast(double(constrTestValues.int64)); + expectedTestValues.uint32CreateVal = bit_cast(double(constrTestValues.uint32)); + expectedTestValues.uint64CreateVal = bit_cast(double(constrTestValues.uint64)); + expectedTestValues.float32CreateVal = bit_cast(double(constrTestValues.float32)); + expectedTestValues.float64CreateVal = bit_cast(constrTestValues.float64); + expectedTestValues.additionVal = emulated_float64_t::create(aAsDouble + bAsDouble).data; + expectedTestValues.substractionVal = emulated_float64_t::create(aAsDouble - bAsDouble).data; + expectedTestValues.multiplicationVal = emulated_float64_t::create(aAsDouble * bAsDouble).data; + expectedTestValues.divisionVal = emulated_float64_t::create(aAsDouble / bAsDouble).data; + expectedTestValues.lessOrEqualVal = aAsDouble <= bAsDouble; + expectedTestValues.greaterOrEqualVal = aAsDouble >= bAsDouble; + expectedTestValues.equalVal = aAsDouble == bAsDouble; + expectedTestValues.notEqualVal = aAsDouble != bAsDouble; + expectedTestValues.lessVal = aAsDouble < bAsDouble; + expectedTestValues.greaterVal = aAsDouble > bAsDouble; + } + }; + + struct EmulatedFloat64TestOutput + { + bool cpuTestsSucceed; + bool gpuTestsSucceed; + }; + + EmulatedFloat64TestOutput emulatedFloat64LoopedTests_impl(EF64Submitter& submitter, + const uint32_t iterations, + const std::function& determineValueA, + const std::function& determineValueB) + { + EmulatedFloat64TestOutput output = {true, true}; + + std::uniform_int_distribution i32Distribution(-std::numeric_limits::max(), std::numeric_limits::max()); + std::uniform_int_distribution i64Distribution(-std::numeric_limits::max(), std::numeric_limits::max()); + std::uniform_int_distribution u32Distribution(-std::numeric_limits::max(), std::numeric_limits::max()); + std::uniform_int_distribution u64Distribution(-std::numeric_limits::max(), std::numeric_limits::max()); + std::uniform_real_distribution fDistribution(-100000.0, 100000.0); + + std::random_device rd; + std::mt19937 mt(rd()); + + for (uint32_t i = 0u; i < iterations; ++i) + { + // generate random test values + EmulatedFloat64TestValuesInfo testValInfo; + double aTmp = determineValueA(); + double bTmp = determineValueB(); + testValInfo.a.data = reinterpret_cast::storage_t&>(aTmp); + testValInfo.b.data = reinterpret_cast::storage_t&>(bTmp); + testValInfo.constrTestValues.int32 = i32Distribution(mt); + testValInfo.constrTestValues.int64 = i64Distribution(mt); + testValInfo.constrTestValues.uint32 = u32Distribution(mt); + testValInfo.constrTestValues.uint64 = u64Distribution(mt); + testValInfo.constrTestValues.float32 = fDistribution(mt); + testValInfo.constrTestValues.float64 = fDistribution(mt); + + testValInfo.fillExpectedTestValues(); + auto singleTestOutput = performEmulatedFloat64Tests(testValInfo, submitter); + + if (!singleTestOutput.cpuTestsSucceed) + output.cpuTestsSucceed = false; + if (!singleTestOutput.gpuTestsSucceed) + output.gpuTestsSucceed = false; + } + + return output; + } + + EmulatedFloat64TestOutput emulatedFloat64RandomValuesTest(EF64Submitter& submitter) + { + auto getRandomFloat64 = []() + { + static std::random_device rd; + static std::mt19937 mt(rd()); + static std::uniform_real_distribution distribution(-100000.0, 100000.0); + + + return distribution(mt); + }; + + return emulatedFloat64LoopedTests_impl(submitter, EmulatedFloat64TestIterations, getRandomFloat64, getRandomFloat64); + } + + EmulatedFloat64TestOutput emulatedFloat64RandomValuesTestContrastingExponents(EF64Submitter& submitter) + { + auto getRandomSmallFloat64 = []() + { + static std::random_device rd; + static std::mt19937 mt(rd()); + static std::uniform_real_distribution distribution(-0.01, 0.01); + + return distribution(mt); + }; + + auto getRandomLargeFloat64 = []() + { + static std::random_device rd; + static std::mt19937 mt(rd()); + static std::uniform_real_distribution distribution(1000000000.0, 2000000000.0); + static std::uniform_int_distribution coinFlipDistribution(0, 1); + + double output = distribution(mt); + if (coinFlipDistribution(mt)) + output = -output; + + return output; + }; + + EmulatedFloat64TestOutput firstTestOutput = emulatedFloat64LoopedTests_impl(submitter, EmulatedFloat64TestIterations / 2, getRandomSmallFloat64, getRandomLargeFloat64); + EmulatedFloat64TestOutput secondTestOutput = emulatedFloat64LoopedTests_impl(submitter, EmulatedFloat64TestIterations / 2, getRandomLargeFloat64, getRandomSmallFloat64); + + EmulatedFloat64TestOutput output; + output.cpuTestsSucceed = firstTestOutput.cpuTestsSucceed && secondTestOutput.cpuTestsSucceed; + output.gpuTestsSucceed = firstTestOutput.gpuTestsSucceed && secondTestOutput.gpuTestsSucceed; + return output; + } + + EmulatedFloat64TestOutput emulatedFloat64BothValuesNaNTest(EF64Submitter& submitter) + { + smart_refctd_ptr semaphore = m_device->createSemaphore(0); + + EmulatedFloat64TestValuesInfo testValInfo; + const float32_t nan32 = std::numeric_limits::quiet_NaN(); + const float64_t nan64 = std::numeric_limits::quiet_NaN(); + testValInfo.a = emulated_float64_t::create(nan64); + testValInfo.b = emulated_float64_t::create(nan64); + testValInfo.constrTestValues = { + .int32 = std::bit_cast(nan32), + .int64 = std::bit_cast(nan64), + .uint32 = std::bit_cast(nan32), + .uint64 = std::bit_cast(nan64), + .float32 = nan32 + //.float64 = nan64 + }; + + testValInfo.fillExpectedTestValues(); + return performEmulatedFloat64Tests(testValInfo, submitter); + } + + EmulatedFloat64TestOutput emulatedFloat64NegAndPosZeroTest(EF64Submitter& submitter) + { + smart_refctd_ptr semaphore = m_device->createSemaphore(0); + + EmulatedFloat64TestValuesInfo testValInfo; + testValInfo.a = emulated_float64_t::create(ieee754::traits::signMask); + testValInfo.b = emulated_float64_t::create(std::bit_cast(0.0)); + testValInfo.constrTestValues = { + .int32 = 0, + .int64 = 0, + .uint32 = 0, + .uint64 = 0, + .float32 = 0}; + + testValInfo.fillExpectedTestValues(); + auto firstTestOutput = performEmulatedFloat64Tests(testValInfo, submitter); + std::swap(testValInfo.a, testValInfo.b); + testValInfo.fillExpectedTestValues(); + auto secondTestOutput = performEmulatedFloat64Tests(testValInfo, submitter); + + return {firstTestOutput.cpuTestsSucceed && secondTestOutput.cpuTestsSucceed, firstTestOutput.gpuTestsSucceed && secondTestOutput.gpuTestsSucceed}; + } + + EmulatedFloat64TestOutput emulatedFloat64BothValuesInfTest(EF64Submitter& submitter) + { + smart_refctd_ptr semaphore = m_device->createSemaphore(0); + + EmulatedFloat64TestValuesInfo testValInfo; + const float32_t inf32 = std::numeric_limits::infinity(); + const float64_t inf64 = std::numeric_limits::infinity(); + testValInfo.a = emulated_float64_t::create(inf64); + testValInfo.b = emulated_float64_t::create(inf64); + testValInfo.constrTestValues = { + .int32 = 0, + .int64 = 0, + .uint32 = 0, + .uint64 = 0, + .float32 = inf32 + //.float64 = inf64 + }; + + testValInfo.fillExpectedTestValues(); + return performEmulatedFloat64Tests(testValInfo, submitter); + } + + EmulatedFloat64TestOutput emulatedFloat64BothValuesNegInfTest(EF64Submitter& submitter) + { + smart_refctd_ptr semaphore = m_device->createSemaphore(0); + + EmulatedFloat64TestValuesInfo testValInfo; + const float32_t inf32 = -std::numeric_limits::infinity(); + const float64_t inf64 = -std::numeric_limits::infinity(); + testValInfo.a = emulated_float64_t::create(inf64); + testValInfo.b = emulated_float64_t::create(inf64); + testValInfo.constrTestValues = { + .int32 = 0, + .int64 = 0, + .uint32 = 0, + .uint64 = 0, + .float32 = inf32 + //.float64 = inf64 + }; + + testValInfo.fillExpectedTestValues(); + return performEmulatedFloat64Tests(testValInfo, submitter); + } + + EmulatedFloat64TestOutput emulatedFloat64OneValIsInfOtherIsNegInfTest(EF64Submitter& submitter) + { + smart_refctd_ptr semaphore = m_device->createSemaphore(0); + + EmulatedFloat64TestValuesInfo testValInfo; + const float64_t inf64 = -std::numeric_limits::infinity(); + testValInfo.a = emulated_float64_t::create(inf64); + testValInfo.b = emulated_float64_t::create(inf64); + testValInfo.constrTestValues = { + .int32 = 0, + .int64 = 0, + .uint32 = 0, + .uint64 = 0, + .float32 = 0 + //.float64 = inf64 + }; + + testValInfo.fillExpectedTestValues(); + auto firstTestOutput = performEmulatedFloat64Tests(testValInfo, submitter); + std::swap(testValInfo.a, testValInfo.b); + testValInfo.fillExpectedTestValues(); + auto secondTestOutput = performEmulatedFloat64Tests(testValInfo, submitter); + + return {firstTestOutput.cpuTestsSucceed && secondTestOutput.cpuTestsSucceed, firstTestOutput.gpuTestsSucceed && secondTestOutput.gpuTestsSucceed}; + } + + // TODO: fix + EmulatedFloat64TestOutput emulatedFloat64BNaNTest(EF64Submitter& submitter) + { + EmulatedFloat64TestOutput output = {true, true}; + smart_refctd_ptr semaphore = m_device->createSemaphore(0); + + for (uint32_t i = 0u; i < EmulatedFloat64TestIterations; ++i) + { + std::random_device rd; + std::mt19937 mt(rd()); + + std::uniform_int_distribution i32Distribution(-std::numeric_limits::max(), std::numeric_limits::max()); + std::uniform_int_distribution i64Distribution(-std::numeric_limits::max(), std::numeric_limits::max()); + std::uniform_int_distribution u32Distribution(-std::numeric_limits::max(), std::numeric_limits::max()); + std::uniform_int_distribution u64Distribution(-std::numeric_limits::max(), std::numeric_limits::max()); + std::uniform_real_distribution f32Distribution(-100000.0f, 100000.0f); + std::uniform_real_distribution f64Distribution(-100000.0, 100000.0); + + EmulatedFloat64TestValuesInfo testValInfo; + double aTmp = f64Distribution(mt); + double bTmp = std::numeric_limits::quiet_NaN(); + testValInfo.a.data = reinterpret_cast::storage_t&>(aTmp); + testValInfo.b.data = reinterpret_cast::storage_t&>(bTmp); + testValInfo.constrTestValues.int32 = i32Distribution(mt); + testValInfo.constrTestValues.int64 = i64Distribution(mt); + testValInfo.constrTestValues.uint32 = u32Distribution(mt); + testValInfo.constrTestValues.uint64 = u64Distribution(mt); + testValInfo.constrTestValues.float32 = f32Distribution(mt); + //testValInfo.constrTestValues.float64 = f64Distribution(mt); + + testValInfo.fillExpectedTestValues(); + auto singleTestOutput = performEmulatedFloat64Tests(testValInfo, submitter); + + if (!singleTestOutput.cpuTestsSucceed) + output.cpuTestsSucceed = false; + if (!singleTestOutput.gpuTestsSucceed) + output.gpuTestsSucceed = false; + } + + return output; + } + + EmulatedFloat64TestOutput emulatedFloat64OneValIsInfTest(EF64Submitter& submitter) + { + auto getRandomFloat64 = []() + { + static std::random_device rd; + static std::mt19937 mt(rd()); + static std::uniform_real_distribution distribution(-100000.0, 100000.0); + + return distribution(mt); + }; + + auto getInfinity = []() + { + return std::numeric_limits::infinity(); + }; + + EmulatedFloat64TestOutput firstTestOutput = emulatedFloat64LoopedTests_impl(submitter, EmulatedFloat64TestIterations / 2, getRandomFloat64, getInfinity); + EmulatedFloat64TestOutput secondTestOutput = emulatedFloat64LoopedTests_impl(submitter, EmulatedFloat64TestIterations / 2, getInfinity, getRandomFloat64); + + EmulatedFloat64TestOutput output; + output.cpuTestsSucceed = firstTestOutput.cpuTestsSucceed && secondTestOutput.cpuTestsSucceed; + output.gpuTestsSucceed = firstTestOutput.gpuTestsSucceed && secondTestOutput.gpuTestsSucceed; + return output; + } + + EmulatedFloat64TestOutput emulatedFloat64OneValIsNegInfTest(EF64Submitter& submitter) + { + auto getRandomFloat64 = []() + { + static std::random_device rd; + static std::mt19937 mt(rd()); + static std::uniform_real_distribution distribution(-100000.0, 100000.0); + + + return distribution(mt); + }; + + auto getNegInfinity = []() + { + return -std::numeric_limits::infinity(); + }; + + EmulatedFloat64TestOutput firstTestOutput = emulatedFloat64LoopedTests_impl(submitter, EmulatedFloat64TestIterations / 2, getRandomFloat64, getNegInfinity); + EmulatedFloat64TestOutput secondTestOutput = emulatedFloat64LoopedTests_impl(submitter, EmulatedFloat64TestIterations / 2, getNegInfinity, getRandomFloat64); + + EmulatedFloat64TestOutput output; + output.cpuTestsSucceed = firstTestOutput.cpuTestsSucceed && secondTestOutput.cpuTestsSucceed; + output.gpuTestsSucceed = firstTestOutput.gpuTestsSucceed && secondTestOutput.gpuTestsSucceed; + return output; + } + + EmulatedFloat64TestOutput emulatedFloat64OneValIsZeroTest(EF64Submitter& submitter) + { + auto getRandomFloat64 = []() + { + static std::random_device rd; + static std::mt19937 mt(rd()); + static std::uniform_real_distribution distribution(-100000.0, 100000.0); + + return distribution(mt); + }; + + auto getZero = []() + { + return 0.0; + }; + + EmulatedFloat64TestOutput firstTestOutput = emulatedFloat64LoopedTests_impl(submitter, EmulatedFloat64TestIterations / 2, getRandomFloat64, getZero); + EmulatedFloat64TestOutput secondTestOutput = emulatedFloat64LoopedTests_impl(submitter, EmulatedFloat64TestIterations / 2, getZero, getRandomFloat64); + + EmulatedFloat64TestOutput output; + output.cpuTestsSucceed = firstTestOutput.cpuTestsSucceed && secondTestOutput.cpuTestsSucceed; + output.gpuTestsSucceed = firstTestOutput.gpuTestsSucceed && secondTestOutput.gpuTestsSucceed; + return output; + } + + EmulatedFloat64TestOutput emulatedFloat64OneValIsNegZeroTest(EF64Submitter& submitter) + { + auto getRandomFloat64 = []() + { + static std::random_device rd; + static std::mt19937 mt(rd()); + static std::uniform_real_distribution distribution(-100000.0, 100000.0); + + return distribution(mt); + }; + + auto getNegZero = []() + { + return -0.0; + }; + + EmulatedFloat64TestOutput firstTestOutput = emulatedFloat64LoopedTests_impl(submitter, EmulatedFloat64TestIterations / 2, getRandomFloat64, getNegZero); + EmulatedFloat64TestOutput secondTestOutput = emulatedFloat64LoopedTests_impl(submitter, EmulatedFloat64TestIterations / 2, getNegZero, getRandomFloat64); + + EmulatedFloat64TestOutput output; + output.cpuTestsSucceed = firstTestOutput.cpuTestsSucceed && secondTestOutput.cpuTestsSucceed; + output.gpuTestsSucceed = firstTestOutput.gpuTestsSucceed && secondTestOutput.gpuTestsSucceed; + return output; + } + + template + EmulatedFloat64TestOutput performEmulatedFloat64Tests(EmulatedFloat64TestValuesInfo& testValInfo, EF64Submitter& submitter) + { + emulated_float64_t a = testValInfo.a; + emulated_float64_t b = testValInfo.b; + + const TestValues cpuTestValues = { + .int32CreateVal = emulated_float64_t::create(testValInfo.constrTestValues.int32).data, + .int64CreateVal = emulated_float64_t::create(testValInfo.constrTestValues.int64).data, + .uint32CreateVal = emulated_float64_t::create(testValInfo.constrTestValues.uint32).data, + .uint64CreateVal = emulated_float64_t::create(testValInfo.constrTestValues.uint64).data, + .float32CreateVal = emulated_float64_t::create(testValInfo.constrTestValues.float32).data, + .float64CreateVal = emulated_float64_t::create(testValInfo.constrTestValues.float64).data, + .additionVal = (a + b).data, + .substractionVal = (a - b).data, + .multiplicationVal = (a * b).data, + .divisionVal = (a / b).data, + .lessOrEqualVal = a <= b, + .greaterOrEqualVal = a >= b, + .equalVal = a == b, + .notEqualVal = a != b, + .lessVal = a + b}; + + EmulatedFloat64TestOutput output; + + // cpu validation + output.cpuTestsSucceed = compareEmulatedFloat64TestValues(testValInfo.expectedTestValues, cpuTestValues); + + // gpu validation + PushConstants pc; + pc.a = reinterpret_cast(a); + pc.b = reinterpret_cast(b); + pc.constrTestVals = testValInfo.constrTestValues; + + submitter.setPushConstants(pc); + auto gpuTestValues = submitter.submitGetGPUTestValues(); + + output.gpuTestsSucceed = compareEmulatedFloat64TestValues(testValInfo.expectedTestValues, gpuTestValues); + + return output; + } + + void runEF64Benchmarks() + { + constexpr uint32_t WarmupDispatches = 1000; + constexpr uint64_t TargetBudgetMs = 400; // ~400ms per row + + Aggregator agg(m_logger, m_device, m_physicalDevice, getComputeQueue()->getFamilyIndex()); + agg.applyCli({ + .argv = this->argv, + .defaultOutputPath = "EF64Bench.json", + .appName = "64_EmulatedFloatTest", + }); + + const auto shaderKey = nbl::this_example::builtin::build::get_spirv_key<"benchmark">(m_device.get()); + auto shaderVariant = GPUBenchmarkHelper::ShaderVariant::Precompiled(shaderKey); + + // One bench instance per mode -> one report row per mode. std::array + // gives stack-allocated, pointer-stable storage; no parallel + // benchPtrs vector needed since the aggregator iterates the span + // directly. + constexpr std::pair kModes[] = { + {EF64_BENCHMARK_MODE::NATIVE, "native"}, + {EF64_BENCHMARK_MODE::EF64_FAST_MATH_ENABLED, "emulated, fast-math"}, + {EF64_BENCHMARK_MODE::EF64_FAST_MATH_DISABLED, "emulated, strict"}, + {EF64_BENCHMARK_MODE::SUBGROUP_DIVIDED_WORK, "subgroup-divided"}, + {EF64_BENCHMARK_MODE::INTERLEAVED, "interleaved"}, + }; + constexpr size_t N = std::size(kModes); + std::vector benches; + benches.reserve(N); + for (size_t i = 0; i < N; ++i) + { + const auto& [mode, leaf] = kModes[i]; + benches.emplace_back(agg, CEF64Benchmark::SetupData{ + .assetMgr = m_assetMgr, + .name = {"EF64", leaf}, + .mode = mode, + .variant = shaderVariant, + .warmupDispatches = WarmupDispatches, + .targetBudgetMs = TargetBudgetMs, + }); + } + + const RunContext ctx = { + .shape = CEF64Benchmark::shape(), + .targetBudgetMs = TargetBudgetMs, + .sectionLabel = CEF64Benchmark::kSectionLabel, + }; + agg.runSessionAndReport(Aggregator::makeSpan(benches, ctx)); + } + + + template + inline bool logFail(const char* msg, Args&&... args) + { + m_logger->log(msg, ILogger::ELL_ERROR, std::forward(args)...); + return false; + } + + std::ofstream m_logFile; }; NBL_MAIN_FUNC(CompatibilityTest) \ No newline at end of file diff --git a/common/include/nbl/examples/Benchmark/BenchmarkCli.h b/common/include/nbl/examples/Benchmark/BenchmarkCli.h new file mode 100644 index 000000000..abb0912da --- /dev/null +++ b/common/include/nbl/examples/Benchmark/BenchmarkCli.h @@ -0,0 +1,125 @@ +// Copyright (C) 2018-2024 - DevSH Graphics Programming Sp. z O.O. +// This file is part of the "Nabla Engine". +// For conditions of distribution and use, see copyright notice in nabla.h + +#ifndef _NBL_COMMON_BENCHMARK_CLI_INCLUDED_ +#define _NBL_COMMON_BENCHMARK_CLI_INCLUDED_ + +#include +#include "nbl/examples/Benchmark/BenchmarkTypes.h" + +#include +#include +#include +#include +#include +#include +#include +#include + +namespace benchmark_cli +{ + +struct ParsedArgs +{ + std::string outputPath; + bool noBaseline = false; + bool noColor = false; + bool helpRequested = false; + std::vector> baselines; // (label, path) + nbl::core::vector> focus; + // Median-of-K window count used for focused rows (see + // IBenchmark::samplesForCurrentRow). Default 3 trades 3 * targetBudgetMs + // wall time for jitter-robust comparisons. + uint32_t focusSamples = 3; +}; + +// Pure: parse argv into a ParsedArgs. Unknown flags are silently ignored; +// the caller decides what to do on help / no-baseline / per-load failure. +inline ParsedArgs parseArgs(std::span argv, std::string defaultOutputPath) +{ + ParsedArgs out; + out.outputPath = std::move(defaultOutputPath); + + for (size_t i = 1; i < argv.size(); ++i) + { + if (argv[i] == "--output" && i + 1 < argv.size()) + out.outputPath = argv[++i]; + else if (argv[i] == "--no-baseline") + out.noBaseline = true; + else if (argv[i] == "--no-color") + out.noColor = true; + else if (argv[i] == "--baseline" && i + 1 < argv.size()) + { + const std::string& spec = argv[++i]; + const auto eq = spec.find('='); + std::string label, path; + if (eq == std::string::npos) + { + path = spec; + const auto stem = std::filesystem::path(path).stem().string(); + label = stem.empty() ? std::string("baseline") : stem; + } + else + { + label = spec.substr(0, eq); + path = spec.substr(eq + 1); + } + out.baselines.emplace_back(std::move(label), std::move(path)); + } + else if (argv[i] == "--focus" && i + 1 < argv.size()) + { + out.focus.push_back(splitFocusSpec(argv[++i])); + } + else if (argv[i] == "--focus-samples" && i + 1 < argv.size()) + { + // Clamp to [1, 32]: 1 disables the median+outlier path, 32 is well past + // the point of diminishing returns (variance of the trimmed mean drops + // ~1/sqrt(K)). from_chars instead of stol to stay no-exceptions per + // Nabla style; malformed input leaves the default in place. + const std::string& s = argv[++i]; + long v = 0; + const auto [_, ec] = std::from_chars(s.data(), s.data() + s.size(), v); + if (ec == std::errc{}) + out.focusSamples = uint32_t(std::clamp(v, 1, 32)); + } + else if (argv[i] == "--help" || argv[i] == "-h") + { + out.helpRequested = true; + } + } + return out; +} + +inline void printHelp(nbl::system::ILogger* logger, std::string_view appName, std::string_view defaultOutputPath) +{ + benchLogFmt(logger, nbl::system::ILogger::ELL_INFO, + "{} CLI:\n" + " --output PATH write this run's report to PATH (default: {})\n" + " --baseline [LABEL=]PATH load PATH as a baseline; LABEL becomes the column header ('vs LABEL').\n" + " repeatable. If LABEL= is omitted, the file's stem is used\n" + " (e.g. main.json -> 'main'). '=' is used instead of ':' so Windows\n" + " drive letters in paths don't collide with the separator.\n" + " --no-baseline skip the default auto-load of the output path\n" + " --no-color disable ANSI color in the live table (also honored: NO_COLOR=1 env var)\n" + " --focus NAME print a focused baseline-comparison table for NAME before the run.\n" + " NAME is the hierarchical name with '>' between segments (whitespace\n" + " around '>' is optional). Repeatable; one row per --focus. The first\n" + " loaded baseline is the reference for inline deltas in this table.\n" + " Example: --focus \"Linear > Linear > 1:1\"\n" + " --focus-samples N run each focused row N times (median + outlier rejection) for\n" + " jitter-robust comparisons. Default 3; clamped to [1, 32]. N=1\n" + " matches the rest-phase single-shot path. Wall time per focused\n" + " row scales linearly with N.\n" + " --help, -h print this help\n" + "\n" + "Default behaviour: with no flags, the prior run's output (if present) is loaded as the single\n" + " 'baseline', and a fresh one is written at the end; iterate-and-compare with no flags needed.\n" + "\n" + "Failed loads (missing/corrupt file) log a warning and continue; the corresponding column reads 'n/a'.", + appName, defaultOutputPath); +} + +} + +#endif diff --git a/common/include/nbl/examples/Benchmark/BenchmarkConsole.h b/common/include/nbl/examples/Benchmark/BenchmarkConsole.h new file mode 100644 index 000000000..e857c36d4 --- /dev/null +++ b/common/include/nbl/examples/Benchmark/BenchmarkConsole.h @@ -0,0 +1,526 @@ +// Copyright (C) 2018-2024 - DevSH Graphics Programming Sp. z O.O. +// This file is part of the "Nabla Engine". +// For conditions of distribution and use, see copyright notice in nabla.h + +#ifndef _NBL_COMMON_BENCHMARK_CONSOLE_INCLUDED_ +#define _NBL_COMMON_BENCHMARK_CONSOLE_INCLUDED_ + +#include +#include "nbl/examples/Benchmark/BenchmarkTypes.h" + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +// Methods templated on the baselines range must expose `.label` and `.rowsByName`. +class BenchmarkConsole +{ + public: + BenchmarkConsole() + { + // https://no-color.org + if (const char* nc = std::getenv("NO_COLOR"); nc && nc[0] != '\0') + m_useAnsi = false; + } + explicit BenchmarkConsole(nbl::core::smart_refctd_ptr logger) + : BenchmarkConsole() + { + m_logger = std::move(logger); + } + + void setLogger(nbl::core::smart_refctd_ptr logger) { m_logger = std::move(logger); } + nbl::system::ILogger* getLogger() const { return m_logger.get(); } + + void setSilent(bool s) { m_silent = s; } + bool silent() const { return m_silent; } + + void setColorEnabled(bool e) { m_useAnsi = e; } + bool colorEnabled() const { return m_useAnsi; } + + // `neutral` is ELL_PERFORMANCE blue (not a full reset) so uncolored cell + // parts inherit the logger's line-wrap color. Only correct because rows / + // banners are all logged at ELL_PERFORMANCE. + struct Ansi + { + static constexpr std::string_view neutral = "\033[34m"; + static constexpr std::string_view reset = "\033[0m"; + static constexpr std::string_view red = "\033[31m"; + static constexpr std::string_view green = "\033[32m"; + static constexpr std::string_view yellow = "\033[33m"; + static constexpr std::string_view cyan = "\033[36m"; + static constexpr std::string_view bold = "\033[1m"; + }; + + // visualWidth excludes ANSI escape bytes (std::format's `{:>{}}` counts + // bytes), so colored cells must be padded manually via padCell. + struct CellOut + { + std::string text; + size_t visualWidth = 0; + }; + + const Format::Widths& widths() const { return m_widths; } + void growWidthFor(std::string_view joined) { m_widths.grow(joined); } + + // Sizes int columns to unchanged-value width, float columns to "value + // (+/-delta)" with delta=0. Changed-int rows overflow; padding every row + // for worst-case wastes ~40% horizontal space on stable runs. + void growForBaseline(const BaselineRow& b) + { + const auto growInt = [&](size_t& w, uint64_t v) + { + if (v == BaselineRow::kAbsent) + return; + w = std::max(w, std::format("{}", v).size()); + }; + growInt(m_widths.regs, b.registerCount); + growInt(m_widths.code, b.codeSizeBytes); + growInt(m_widths.shared, b.sharedMemBytes); + growInt(m_widths.local, b.privateMemBytes); + + if (b.psPerSample > 0.0) + { + m_widths.psSample = std::max(m_widths.psSample, floatCellPlainText(b.psPerSample, 0.0).size()); + const double gsBase = 1000.0 / b.psPerSample; + m_widths.gsamples = std::max(m_widths.gsamples, floatCellPlainText(gsBase, 0.0).size()); + } + } + + // Pre-register so the header (logged once up front) doesn't stay narrower than later rows. + void registerVariant(std::span name) { m_widths.grow(joinName(name)); } + void registerVariant(std::initializer_list name) + { + std::vector tmp; + tmp.reserve(name.size()); + for (auto s : name) + tmp.emplace_back(s); + m_widths.grow(joinName(tmp)); + } + + void logSectionBanner(std::string_view banner) const + { + if (banner.empty()) + return; + if (m_useAnsi) + benchLogFmt(m_logger.get(), nbl::system::ILogger::ELL_PERFORMANCE, "{}{}{}{}", Ansi::bold, Ansi::cyan, banner, Ansi::reset); + else + benchLogFmt(m_logger.get(), nbl::system::ILogger::ELL_PERFORMANCE, "{}", banner); + } + + // Once per session, not per span, otherwise readers see the same text N times. + template + void logBannerNotes(const Baselines& baselines) const + { + if (std::empty(baselines)) + return; + const auto& primary = *std::begin(baselines); + const bool multi = std::distance(std::begin(baselines), std::end(baselines)) > 1; + const std::string primaryLabel = primary.label; + benchLogFmt(m_logger.get(), nbl::system::ILogger::ELL_PERFORMANCE, + "Note: ps/sample lower = faster; GSamples/s higher = faster. Inline annotations compare to primary baseline '{}': " + "floats show 'value (+/-delta)' always; ints show 'old -> new' only when changed.", + primaryLabel); + if (multi) + benchLogFmt(m_logger.get(), nbl::system::ILogger::ELL_PERFORMANCE, + "Note: trailing 'vs LABEL' columns carry raw ps/sample deltas against secondary baselines (primary skipped, shown inline)."); + benchLogFmt(m_logger.get(), nbl::system::ILogger::ELL_PERFORMANCE, + "Note: '[WG!]' on a delta = baseline's workload shape (workgroup / dispatch / samplesPerDispatch) differs from this run, comparison is apples-to-oranges."); + benchLogFmt(m_logger.get(), nbl::system::ILogger::ELL_PERFORMANCE, + "Note: float deltas only get green/red coloring when the relative change is >= {:.0f}% (typical GPU jitter is 1-2%); smaller deltas stay neutral.", + kFloatColorThreshold * 100.0); + } + + template + void logHeader(const Baselines& baselines) const + { + std::string line = std::format("{:<{}} | {:>{}} | {:>{}} | {:>{}} | {:>{}} | {:>{}} | {:>{}}", + "Name", m_widths.name, + "ps/sample", m_widths.psSample, + "GSamples/s", m_widths.gsamples, + "regs", m_widths.regs, + "code(B)", m_widths.code, + "shared(B)", m_widths.shared, + "local(B)", m_widths.local); + // Primary is shown inline on every value column; only secondaries get trailing columns. + size_t idx = 0; + for (const auto& b : baselines) + { + if (idx++ == 0) + continue; + const std::string col = std::format("vs {}", b.label); + line += std::format(" | {:>{}}", col, baselineColWidth(b.label)); + } + benchLogFmt(m_logger.get(), nbl::system::ILogger::ELL_PERFORMANCE, "{}", line); + } + + template + void logRow(std::span name, std::string_view joinedName, + const TimingResult& t, const PipelineStats& s, + const std::unordered_map& rowBaselines, + const Baselines& baselines) const + { + if (!m_logger || m_silent) + return; + + const BaselineRow* primary = nullptr; + if (!std::empty(baselines)) + { + const std::string key = makeKey(name); + const auto& b0 = *std::begin(baselines); + if (auto it = b0.rowsByName.find(key); it != b0.rowsByName.end()) + primary = &it->second; + } + + // ps_per_sample * GSamples/s == 1000 (see runTimed), so GSamples is derived not stored. + const auto baselineGSamples = primary ? std::optional{primary->psPerSample > 0.0 ? 1000.0 / primary->psPerSample : 0.0} : std::nullopt; + + std::string line = std::format("{:<{}}", joinedName, m_widths.name); + line += " | " + padCell(formatFloatCell(t.ps_per_sample, primary ? std::optional{primary->psPerSample} : std::nullopt, true), m_widths.psSample); + line += " | " + padCell(formatFloatCell(t.gsamples_per_s, baselineGSamples, false), m_widths.gsamples); + line += " | " + padCell(formatIntCell(s.registerCount, primary ? primary->registerCount : BaselineRow::kAbsent), m_widths.regs); + line += " | " + padCell(formatIntCell(s.codeSizeBytes, primary ? primary->codeSizeBytes : BaselineRow::kAbsent), m_widths.code); + line += " | " + padCell(formatIntCell(s.sharedMemBytes, primary ? primary->sharedMemBytes : BaselineRow::kAbsent), m_widths.shared); + line += " | " + padCell(formatIntCell(s.privateMemBytes, primary ? primary->privateMemBytes : BaselineRow::kAbsent), m_widths.local); + + size_t idx = 0; + for (const auto& b : baselines) + { + if (idx++ == 0) + continue; + std::string plain; + bool better = false; + bool significant = false; + bool haveValue = false; + bool flagShape = false; + if (auto it = rowBaselines.find(b.label); it != rowBaselines.end() && it->second.psPerSample > 0.0) + { + const double delta = t.ps_per_sample - it->second.psPerSample; + plain = std::format("{:+.3f}", delta); + better = delta < 0.0; + significant = std::abs(delta) / it->second.psPerSample >= kFloatColorThreshold; + haveValue = true; + flagShape = it->second.shapeMismatch; + } + else + { + plain = "n/a"; + } + std::string suffix = flagShape ? std::string(" [WG!]") : std::string(); + CellOut cell; + cell.visualWidth = plain.size() + suffix.size(); + if (!m_useAnsi) + { + cell.text = plain + suffix; + } + else + { + const bool paint = haveValue && significant; + const std::string_view col = paint ? (better ? Ansi::green : Ansi::red) : std::string_view{}; + std::string coloredPlain = paint + ? std::format("{}{}{}", col, plain, Ansi::neutral) + : plain; + std::string coloredSuffix = flagShape + ? std::format("{}{}{}{}", Ansi::bold, Ansi::red, suffix, Ansi::neutral) + : std::string(); + cell.text = coloredPlain + coloredSuffix; + } + line += " | " + padCell(cell, baselineColWidth(b.label)); + } + benchLogFmt(m_logger.get(), nbl::system::ILogger::ELL_PERFORMANCE, "{}", line); + } + + // Flat table, one row per (variant, stat); each baseline gets one delta column: + // + // Name | stat | current | vs iter47 | vs iter48 + // X | ps/sample | 2.151 | -0.044 | +0.123 + // X | GSamples/s | 464.9 | +9.456 | -7.234 + // X | regs | 40 | +0 | +0 + // X | code(B) | 4992 | +128 | 0 + template + void printBaselineComparison(std::span> names, + const Baselines& baselines, const Results& results) const + { + if (!m_logger || names.empty()) + return; + if (std::empty(baselines)) + { + benchLogFmt(m_logger.get(), nbl::system::ILogger::ELL_WARNING, + "--focus requested {} variant(s) but no baselines are loaded, nothing to compare against. " + "Did your --baseline paths fail to load?", + names.size()); + return; + } + + struct Current + { + TimingResult t; + PipelineStats s; + Workload w; + bool present = false; + }; + std::unordered_map currentByKey; + currentByKey.reserve(std::size(results)); + for (const auto& r : results) + currentByKey[makeKey(r.name)] = {r.timing, r.stats, r.workload, true}; + + const size_t baselineCount = static_cast(std::distance(std::begin(baselines), std::end(baselines))); + + std::vector> rows; + rows.reserve(1 + names.size() * 6); + + { + auto plainCell = [](std::string s) -> CellOut { const size_t w = s.size(); return {std::move(s), w}; }; + std::vector header; + header.reserve(3 + baselineCount); + header.push_back(plainCell("Name")); + header.push_back(plainCell("stat")); + header.push_back(plainCell("current")); + for (const auto& b : baselines) + header.push_back(plainCell(std::format("vs {}", b.label))); + rows.push_back(std::move(header)); + } + + auto floatStatRow = [&](const char* label, std::string_view joined, bool have, double curV, + const Workload& curW, const std::string& key, + auto baselineLookup /*BaselineRow -> double*/, bool lowerIsBetter) + { + auto plainCell = [](std::string s) -> CellOut { const size_t w = s.size(); return {std::move(s), w}; }; + std::vector row; + row.reserve(3 + baselineCount); + row.push_back(plainCell(std::string(joined))); + row.push_back(plainCell(label)); + row.push_back(have ? plainCell(formatFloat5(curV)) : plainCell("n/a")); + + for (const auto& b : baselines) + { + auto bit = b.rowsByName.find(key); + if (!have || bit == b.rowsByName.end()) + { + row.push_back(plainCell("n/a")); + continue; + } + const double baseV = baselineLookup(bit->second); + if (baseV <= 0.0) + { + row.push_back(plainCell("n/a")); + continue; + } + const bool shapeMismatch = curW.present() && bit->second.workload.present() && (curW.shape != bit->second.workload.shape); + const double delta = curV - baseV; + const std::string deltaStr = std::format("{}{}", delta >= 0 ? "+" : "-", formatFloat5(std::abs(delta))); + const bool significant = std::abs(delta) / baseV >= kFloatColorThreshold; + const std::string suffix = shapeMismatch ? std::string(" [WG!]") : std::string(); + CellOut cell; + cell.visualWidth = deltaStr.size() + suffix.size(); + if (!m_useAnsi || !significant) + { + cell.text = m_useAnsi && shapeMismatch + ? std::format("{}{}{}{}{}", deltaStr, Ansi::bold, Ansi::red, suffix, Ansi::neutral) + : deltaStr + suffix; + } + else + { + const bool better = (lowerIsBetter && delta < 0.0) || (!lowerIsBetter && delta > 0.0); + const std::string_view col = better ? Ansi::green : Ansi::red; + std::string coloredDelta = std::format("{}{}{}", col, deltaStr, Ansi::neutral); + std::string coloredSuffix = shapeMismatch + ? std::format("{}{}{}{}", Ansi::bold, Ansi::red, suffix, Ansi::neutral) + : std::string(); + cell.text = coloredDelta + coloredSuffix; + } + row.push_back(std::move(cell)); + } + rows.push_back(std::move(row)); + }; + + auto intStatRow = [&](const char* label, std::string_view joined, bool have, uint64_t curV, + const Workload& curW, const std::string& key, uint64_t BaselineRow::* baseField) + { + auto plainCell = [](std::string s) -> CellOut { const size_t w = s.size(); return {std::move(s), w}; }; + std::vector row; + row.reserve(3 + baselineCount); + row.push_back(plainCell(std::string(joined))); + row.push_back(plainCell(label)); + row.push_back(have ? plainCell(std::format("{}", curV)) : plainCell("n/a")); + + for (const auto& b : baselines) + { + auto bit = b.rowsByName.find(key); + if (!have || bit == b.rowsByName.end()) + { + row.push_back(plainCell("n/a")); + continue; + } + const uint64_t baseV = bit->second.*baseField; + if (baseV == BaselineRow::kAbsent) + { + row.push_back(plainCell("n/a")); + continue; + } + const bool shapeMismatch = curW.present() && bit->second.workload.present() && (curW.shape != bit->second.workload.shape); + const int64_t delta = int64_t(curV) - int64_t(baseV); + const std::string deltaStr = std::format("{:+d}", delta); + const std::string suffix = shapeMismatch ? std::string(" [WG!]") : std::string(); + CellOut cell; + cell.visualWidth = deltaStr.size() + suffix.size(); + if (!m_useAnsi) + { + cell.text = deltaStr + suffix; + } + else + { + std::string coloredDelta = delta != 0 + ? std::format("{}{}{}", Ansi::yellow, deltaStr, Ansi::neutral) + : deltaStr; + std::string coloredSuffix = shapeMismatch + ? std::format("{}{}{}{}", Ansi::bold, Ansi::red, suffix, Ansi::neutral) + : std::string(); + cell.text = coloredDelta + coloredSuffix; + } + row.push_back(std::move(cell)); + } + rows.push_back(std::move(row)); + }; + + for (const auto& nameVec : names) + { + const std::string joined = joinName(nameVec); + const std::string key = makeKey(nameVec); + const auto cit = currentByKey.find(key); + const bool have = (cit != currentByKey.end()) && cit->second.present; + const auto& t = have ? cit->second.t : TimingResult {}; + const auto& s = have ? cit->second.s : PipelineStats {}; + const auto& w = have ? cit->second.w : Workload {}; + + floatStatRow("ps/sample", joined, have, t.ps_per_sample, w, key, + [](const BaselineRow& b) { return b.psPerSample; }, true); + floatStatRow("GSamples/s", joined, have, t.gsamples_per_s, w, key, + [](const BaselineRow& b) { return b.psPerSample > 0.0 ? 1000.0 / b.psPerSample : 0.0; }, false); + intStatRow("regs", joined, have, s.registerCount, w, key, &BaselineRow::registerCount); + intStatRow("code(B)", joined, have, s.codeSizeBytes, w, key, &BaselineRow::codeSizeBytes); + intStatRow("shared(B)", joined, have, s.sharedMemBytes, w, key, &BaselineRow::sharedMemBytes); + intStatRow("local(B)", joined, have, s.privateMemBytes, w, key, &BaselineRow::privateMemBytes); + } + + const size_t nCols = 3 + baselineCount; + std::vector colWidths(nCols, 0); + for (const auto& r : rows) + for (size_t i = 0; i < r.size() && i < nCols; ++i) + colWidths[i] = std::max(colWidths[i], r[i].visualWidth); + + benchLogFmt(m_logger.get(), nbl::system::ILogger::ELL_PERFORMANCE, + "=== Focus comparison ({} variant(s) vs {} baseline(s); ps/sample lower is better, integer deltas are absolute) ===", + names.size(), baselineCount); + auto leftPad = [](const CellOut& c, size_t targetWidth) -> std::string + { + if (c.visualWidth >= targetWidth) + return c.text; + return c.text + std::string(targetWidth - c.visualWidth, ' '); + }; + for (size_t ri = 0; ri < rows.size(); ++ri) + { + std::string line; + for (size_t ci = 0; ci < rows[ri].size(); ++ci) + { + if (ci) + line.append(" | "); + if (ci <= 1) + line += leftPad(rows[ri][ci], colWidths[ci]); + else + line += padCell(rows[ri][ci], colWidths[ci]); + } + benchLogFmt(m_logger.get(), nbl::system::ILogger::ELL_PERFORMANCE, "{}", line); + } + } + + private: + static constexpr size_t kBaselineMinColWidth = 10; + size_t baselineColWidth(std::string_view label) const + { + return std::max(kBaselineMinColWidth, std::string_view("vs ").size() + label.size()); + } + + // Typical GPU jitter is 1-2%; coloring below 5% would mostly highlight noise. + static constexpr double kFloatColorThreshold = 0.05; + + // std::format counts ANSI escape bytes, so `{:>N}` can't pad colored cells. + std::string padCell(const CellOut& c, size_t targetWidth) const + { + if (c.visualWidth >= targetWidth) + return c.text; + return std::string(targetWidth - c.visualWidth, ' ') + c.text; + } + + // "regs 40 -> 54" is more useful than "+14 from somewhere", show both endpoints. + CellOut formatIntCell(uint64_t current, uint64_t baseline) const + { + if (baseline == BaselineRow::kAbsent || baseline == current) + { + auto s = std::format("{}", current); + const size_t w = s.size(); + return {std::move(s), w}; + } + const std::string baseStr = std::format("{}", baseline); + const std::string curStr = std::format("{}", current); + const std::string plain = std::format("{} -> {}", baseStr, curStr); + const size_t visW = plain.size(); + if (!m_useAnsi) + return {plain, visW}; + auto colored = std::format("{}{} -> {}{}", Ansi::yellow, baseStr, curStr, Ansi::neutral); + return {std::move(colored), visW}; + } + + // ~5 chars including the decimal point, so column widths stay predictable + // across ps/sample (0.5..100) and GSamples/s (0.03..1000+). + static std::string formatFloat5(double v) + { + const double mag = std::abs(v); + if (mag >= 10000.0) return std::format("{:.0f}", v); + if (mag >= 1000.0) return std::format("{:.1f}", v); + if (mag >= 100.0) return std::format("{:.1f}", v); + if (mag >= 10.0) return std::format("{:.2f}", v); + return std::format("{:.3f}", v); + } + + static std::string floatCellPlainText(double value, double delta) + { + const std::string deltaStr = std::format("{}{}", delta >= 0 ? "+" : "-", formatFloat5(std::abs(delta))); + return std::format("{} ({})", formatFloat5(value), deltaStr); + } + + CellOut formatFloatCell(double current, std::optional baseline, bool lowerIsBetter) const + { + if (!baseline.has_value() || *baseline <= 0.0) + { + auto s = formatFloat5(current); + const size_t w = s.size(); + return {std::move(s), w}; + } + const double delta = current - *baseline; + const std::string plain = floatCellPlainText(current, delta); + const size_t visW = plain.size(); + const bool significant = std::abs(delta) / *baseline >= kFloatColorThreshold; + if (!m_useAnsi || !significant) + return {plain, visW}; + const std::string valStr = formatFloat5(current); + const std::string deltaStr = std::format("{}{}", delta >= 0 ? "+" : "-", formatFloat5(std::abs(delta))); + const bool better = (lowerIsBetter && delta < 0.0) || (!lowerIsBetter && delta > 0.0); + const std::string_view color = better ? Ansi::green : Ansi::red; + auto colored = std::format("{} ({}{}{})", valStr, color, deltaStr, Ansi::neutral); + return {std::move(colored), visW}; + } + + nbl::core::smart_refctd_ptr m_logger; + Format::Widths m_widths; + bool m_silent = false; + bool m_useAnsi = true; +}; + +#endif diff --git a/common/include/nbl/examples/Benchmark/BenchmarkJson.h b/common/include/nbl/examples/Benchmark/BenchmarkJson.h new file mode 100644 index 000000000..dc7846848 --- /dev/null +++ b/common/include/nbl/examples/Benchmark/BenchmarkJson.h @@ -0,0 +1,285 @@ +// Copyright (C) 2018-2024 - DevSH Graphics Programming Sp. z O.O. +// This file is part of the "Nabla Engine". +// For conditions of distribution and use, see copyright notice in nabla.h + +#ifndef _NBL_COMMON_BENCHMARK_JSON_INCLUDED_ +#define _NBL_COMMON_BENCHMARK_JSON_INCLUDED_ + +#include +#include "nbl/examples/Benchmark/BenchmarkTypes.h" +#include "nlohmann/json.hpp" + +#include +#include +#include +#include +#include +#include + +namespace benchmark_json +{ + +// Builds the "device" JSON object from a physical device, or null if dev is null. +inline nlohmann::json buildDeviceMetadata(const nbl::video::IPhysicalDevice* dev) +{ + if (!dev) + return nullptr; + const auto& p = dev->getProperties(); + nlohmann::json out = nlohmann::json::object(); + out["name"] = std::string(p.deviceName); + out["vendorID"] = p.vendorID; + out["deviceID"] = p.deviceID; + out["driverID"] = static_cast(p.driverID); + out["driverName"] = std::string(p.driverName); + out["driverInfo"] = std::string(p.driverInfo); + out["driverVersion"] = p.driverVersion; + out["deviceUUID"] = std::vector(p.deviceUUID, p.deviceUUID + 16); + out["driverUUID"] = std::vector(p.driverUUID, p.driverUUID + 16); + return out; +} + +// Parses a JSON report file into a Baseline. Returns nullopt on missing / +// unparseable / empty file. Caller is responsible for appending / replacing +// in their baseline store and for feeding rows into BenchmarkConsole widths. +inline std::optional loadBaselineFile(std::string label, const std::string& path) +{ + std::ifstream f(path); + if (!f.is_open()) + return std::nullopt; + + nlohmann::json j; + try + { + f >> j; + } + catch (const std::exception&) + { + return std::nullopt; + } + + const auto resultsIt = j.find("results"); + if (resultsIt == j.end() || !resultsIt->is_array()) + return std::nullopt; + + std::unordered_map rowsByName; + for (const auto& r : *resultsIt) + { + const auto n = r.find("name"); + const auto ps = r.find("ps_per_sample"); + if (n == r.end() || ps == r.end()) + continue; + if (!n->is_array() || !ps->is_number()) + continue; + std::vector nameVec; + nameVec.reserve(n->size()); + for (const auto& seg : *n) + { + if (!seg.is_string()) + { + nameVec.clear(); + break; + } + nameVec.emplace_back(seg.get()); + } + if (nameVec.empty()) + continue; + + BaselineRow row; + row.psPerSample = ps->get(); + row.registerCount = r.at("regs").get(); + row.codeSizeBytes = r.at("code_bytes").get(); + row.sharedMemBytes = r.at("shared_mem_bytes").get(); + row.privateMemBytes = r.at("local_mem_bytes").get(); + row.stackBytes = r.at("stack_bytes").get(); + row.subgroupSize = r.at("subgroup_size").get(); + + auto readUvec3 = [&](const char* key, nbl::hlsl::uint32_t3& out) + { + const auto& a = r.at(key); + out.x = a[0].get(); + out.y = a[1].get(); + out.z = a[2].get(); + }; + readUvec3("workgroup_size", row.workload.shape.workgroupSize); + readUvec3("dispatch_groups", row.workload.shape.dispatchGroupCount); + row.workload.shape.samplesPerDispatch = r.at("samples_per_dispatch").get(); + row.workload.benchDispatches = r.at("bench_dispatches").get(); + + rowsByName[makeKey(nameVec)] = row; + } + if (rowsByName.empty()) + return std::nullopt; + + return Baseline {std::move(label), path, j.at("device"), std::move(rowsByName)}; +} + +// Writes a JSON report. Preserves rows in the prior file whose names weren't +// re-measured this run, so writeReportFile can be an intermediate checkpoint +// during a multi-bench-class session. Returns preservedCount via out-param. +inline bool writeReportFile(const std::string& path, const nlohmann::json& deviceMetadata, const std::vector& baselines, const std::vector& results, nbl::system::ILogger* logger, size_t* outPreservedCount = nullptr) +{ + nlohmann::json doc; + doc["version"] = 1; + + if (!deviceMetadata.is_null()) + doc["device"] = deviceMetadata; + + if (!baselines.empty()) + { + auto& baselinesNode = doc["baselines"] = nlohmann::json::object(); + for (const auto& b : baselines) + baselinesNode[b.label] = b.path; + } + auto& resultsNode = doc["results"] = nlohmann::json::array(); + + std::unordered_set currentKeys; + currentKeys.reserve(results.size()); + for (const auto& r : results) + currentKeys.insert(makeKey(r.name)); + + for (const auto& r : results) + { + nlohmann::json row; + row["name"] = r.name; + row["ps_per_sample"] = r.timing.ps_per_sample; + row["gsamples_per_s"] = r.timing.gsamples_per_s; + row["ms_total"] = r.timing.ms_total; + row["regs"] = r.stats.registerCount; + row["code_bytes"] = r.stats.codeSizeBytes; + row["shared_mem_bytes"] = r.stats.sharedMemBytes; + row["local_mem_bytes"] = r.stats.privateMemBytes; + row["stack_bytes"] = r.stats.stackBytes; + row["subgroup_size"] = r.stats.subgroupSize; + + // Structured so JSON preserves the exact numeric type. + if (!r.stats.unknowns.empty()) + { + using F = nbl::video::IGPUPipelineBase::SExecutableStatistic::FORMAT; + auto& arr = row["unknown_stats"] = nlohmann::json::array(); + for (const auto& s : r.stats.unknowns) + { + nlohmann::json entry; + entry["name"] = s.name; + switch (s.format) + { + case F::BOOL32: + entry["type"] = "bool"; + entry["value"] = s.value.b32; + break; + case F::INT64: + entry["type"] = "int"; + entry["value"] = s.value.i64; + break; + case F::UINT64: + entry["type"] = "uint"; + entry["value"] = s.value.u64; + break; + case F::FLOAT64: + entry["type"] = "float"; + entry["value"] = s.value.f64; + break; + } + arr.push_back(std::move(entry)); + } + } + + row["workgroup_size"] = {r.workload.shape.workgroupSize.x, r.workload.shape.workgroupSize.y, r.workload.shape.workgroupSize.z}; + row["dispatch_groups"] = {r.workload.shape.dispatchGroupCount.x, r.workload.shape.dispatchGroupCount.y, r.workload.shape.dispatchGroupCount.z}; + row["samples_per_dispatch"] = r.workload.shape.samplesPerDispatch; + row["bench_dispatches"] = r.workload.benchDispatches; + + resultsNode.push_back(std::move(row)); + } + + // Caveat: renamed/removed variants linger forever. Delete the output JSON + // to get a clean slate. + size_t preservedCount = 0; + { + std::ifstream in(path); + if (in.is_open()) + { + nlohmann::json existing; + try + { + in >> existing; + } + catch (const std::exception&) + { + existing = nullptr; + } + const auto rIt = existing.find("results"); + if (rIt != existing.end() && rIt->is_array()) + { + for (const auto& priorRow : *rIt) + { + const auto n = priorRow.find("name"); + if (n == priorRow.end() || !n->is_array()) + continue; + std::vector nameVec; + bool ok = true; + for (const auto& seg : *n) + { + if (!seg.is_string()) + { + ok = false; + break; + } + nameVec.emplace_back(seg.get()); + } + if (!ok || nameVec.empty()) + continue; + if (currentKeys.find(makeKey(nameVec)) != currentKeys.end()) + continue; // re-measured this run + + resultsNode.push_back(priorRow); + ++preservedCount; + } + } + } + } + + std::ofstream f(path, std::ios::out | std::ios::trunc); + if (!f.is_open()) + { + benchLogFmt(logger, nbl::system::ILogger::ELL_ERROR, "benchmark_json::writeReportFile: failed to open '{}'", path); + return false; + } + + // One result per line keeps `git diff` showing one row per change instead + // of N lines per row. + f << "{\n"; + f << " \"version\": " << doc["version"].dump() << ",\n"; + if (doc.contains("device")) + { + // Compact value render so byte arrays (deviceUUID etc.) stay inline. + const auto& dev = doc["device"]; + f << " \"device\": {\n"; + bool first = true; + for (auto it = dev.begin(); it != dev.end(); ++it) + { + if (!first) + f << ",\n"; + first = false; + f << " \"" << it.key() << "\": " << it.value().dump(); + } + f << "\n },\n"; + } + if (doc.contains("baselines")) + f << " \"baselines\": " << doc["baselines"].dump() << ",\n"; + f << " \"results\": ["; + for (size_t i = 0; i < resultsNode.size(); ++i) + { + f << (i ? ",\n " : "\n "); + f << resultsNode[i].dump(); + } + f << (resultsNode.empty() ? "]\n" : "\n ]\n"); + f << "}\n"; + + if (outPreservedCount) + *outPreservedCount = preservedCount; + return true; +} + +} // namespace benchmark_json + +#endif diff --git a/common/include/nbl/examples/Benchmark/BenchmarkTypes.h b/common/include/nbl/examples/Benchmark/BenchmarkTypes.h new file mode 100644 index 000000000..274c19514 --- /dev/null +++ b/common/include/nbl/examples/Benchmark/BenchmarkTypes.h @@ -0,0 +1,211 @@ +// Copyright (C) 2018-2024 - DevSH Graphics Programming Sp. z O.O. +// This file is part of the "Nabla Engine". +// For conditions of distribution and use, see copyright notice in nabla.h + +#ifndef _NBL_COMMON_BENCHMARK_TYPES_INCLUDED_ +#define _NBL_COMMON_BENCHMARK_TYPES_INCLUDED_ + +#include +#include "nlohmann/json.hpp" + +#include +#include +#include +#include +#include +#include +#include +#include + +struct PipelineStats +{ + uint64_t registerCount = 0; + uint64_t codeSizeBytes = 0; + uint64_t sharedMemBytes = 0; + uint64_t privateMemBytes = 0; + uint64_t stackBytes = 0; + uint32_t subgroupSize = 0; + std::string raw; + + // Driver stats matchStat didn't recognise. Structured (not lossy-stringified + // into `raw`) so JSON round-trips the correct numeric type. + std::vector unknowns; +}; + +struct TimingResult +{ + float64_t elapsed_ns = 0.0; + uint64_t totalSamples = 0; + float64_t ps_per_sample = 0.0; + float64_t gsamples_per_s = 0.0; + float64_t ms_total = 0.0; +}; + +struct Format +{ + struct Widths + { + size_t name = std::string_view("Name").size(); + size_t psSample = std::string_view("ps/sample").size(); + size_t gsamples = std::string_view("GSamples/s").size(); + size_t regs = std::string_view("regs").size(); + size_t code = std::string_view("code(B)").size(); + size_t shared = std::string_view("shared(B)").size(); + size_t local = std::string_view("local(B)").size(); + + void grow(std::string_view joinedName) { name = std::max(name, joinedName.size()); } + }; + + static std::string headerBase(const Widths& w = {}) + { + return std::format("{:<{}} | {:>12} | {:>12} | {:>6} | {:>8} | {:>12} | {:>12}", + "Name", w.name, "ps/sample", "GSamples/s", "regs", "code(B)", "shared(B)", "local(B)"); + } + + static std::string dataBase(const Widths& w, std::string_view joinedName, const TimingResult& t, const PipelineStats& s) + { + return std::format("{:<{}} | {:>12.3f} | {:>12.3f} | {:>6} | {:>8} | {:>12} | {:>12}", + joinedName, w.name, t.ps_per_sample, t.gsamples_per_s, s.registerCount, s.codeSizeBytes, s.sharedMemBytes, s.privateMemBytes); + } +}; + +// The "what was measured" part of a workload. Workload (adds benchDispatches) +// and RunContext (adds banner label + budget) both embed a WorkloadShape, so +// the shape can be sliced into either from the other. +struct WorkloadShape +{ + nbl::hlsl::uint32_t3 workgroupSize = {0, 0, 0}; + nbl::hlsl::uint32_t3 dispatchGroupCount = {0, 0, 0}; + uint64_t samplesPerDispatch = 0; + + inline bool operator==(const WorkloadShape& other) const + { + return workgroupSize == other.workgroupSize && dispatchGroupCount == other.dispatchGroupCount && samplesPerDispatch == other.samplesPerDispatch; + } + + inline bool operator!=(const WorkloadShape& other) const + { + return !(*this == other); + } +}; + +struct Workload +{ + WorkloadShape shape; + uint32_t benchDispatches = 0; + + // Default-constructed (all zeros) signals "not recorded". + bool present() const { return shape.samplesPerDispatch != 0; } +}; + +struct BaselineRow +{ + // UINT64_MAX sentinel: no real pipeline stat reaches that magnitude, so an + // "absent" field can't collide with a real value. The current run can also + // produce kAbsent when a driver doesn't expose a given stat. + static constexpr uint64_t kAbsent = std::numeric_limits::max(); + + float64_t psPerSample = 0.0; + uint64_t registerCount = kAbsent; + uint64_t codeSizeBytes = kAbsent; + uint64_t sharedMemBytes = kAbsent; + uint64_t privateMemBytes = kAbsent; + uint64_t stackBytes = kAbsent; + uint64_t subgroupSize = kAbsent; // uint64_t (not 32) to share kAbsent semantics + Workload workload {}; +}; + +// Per-baseline reference for a single row: the baseline's ps/sample plus +// whether its recorded workload shape differs from this run (renders the +// "[WG!]" marker so the reader knows the comparison is questionable). +struct BaselineRef +{ + float64_t psPerSample = 0.0; + bool shapeMismatch = false; +}; + +struct Result +{ + // Hierarchical name, outermost first. Tooling can group by any prefix; the + // console joins with " > ". + nbl::core::vector name; + TimingResult timing {}; + PipelineStats stats {}; + Workload workload {}; + std::unordered_map baselines; +}; + +inline std::string joinName(std::span name, std::string_view sep = " > ") +{ + std::string out; + for (size_t i = 0; i < name.size(); ++i) + { + if (i) + out.append(sep); + out.append(name[i]); + } + return out; +} + +// Unit-separator (\x1f) between segments so makeKey can't collide with any +// user-supplied content. +inline std::string makeKey(std::span name) +{ + std::string k; + size_t total = 0; + for (const auto& s : name) + total += s.size() + 1; + k.reserve(total); + for (size_t i = 0; i < name.size(); ++i) + { + if (i) + k.push_back('\x1f'); + k.append(name[i]); + } + return k; +} + +inline nbl::core::vector splitFocusSpec(std::string_view spec) +{ + auto trim = [](std::string_view s) + { + while (!s.empty() && (s.front() == ' ' || s.front() == '\t')) + s.remove_prefix(1); + while (!s.empty() && (s.back() == ' ' || s.back() == '\t')) + s.remove_suffix(1); + return s; + }; + nbl::core::vector out; + size_t start = 0; + while (start <= spec.size()) + { + size_t end = spec.find('>', start); + if (end == std::string_view::npos) + end = spec.size(); + const auto seg = trim(spec.substr(start, end - start)); + if (!seg.empty()) + out.emplace_back(seg); + if (end == spec.size()) + break; + start = end + 1; + } + return out; +} + +struct Baseline +{ + std::string label; + std::string path; + nlohmann::json device; // top-level "device" field from the file, or null if absent + std::unordered_map rowsByName; // makeKey(name) -> stats +}; + +template +inline void benchLogFmt(nbl::system::ILogger* logger, nbl::system::ILogger::E_LOG_LEVEL level, std::string_view fmt, const Args&... args) +{ + if (!logger) + return; + logger->log("%s", level, std::vformat(fmt, std::make_format_args(args...)).c_str()); +} + +#endif diff --git a/common/include/nbl/examples/Benchmark/GPUBenchmarkHelper.h b/common/include/nbl/examples/Benchmark/GPUBenchmarkHelper.h new file mode 100644 index 000000000..b5bd69a13 --- /dev/null +++ b/common/include/nbl/examples/Benchmark/GPUBenchmarkHelper.h @@ -0,0 +1,693 @@ +// Copyright (C) 2018-2024 - DevSH Graphics Programming Sp. z O.O. +// This file is part of the "Nabla Engine". +// For conditions of distribution and use, see copyright notice in nabla.h + +#ifndef _NBL_COMMON_GPU_BENCHMARK_HELPER_INCLUDED_ +#define _NBL_COMMON_GPU_BENCHMARK_HELPER_INCLUDED_ + +#include +#include "nbl/examples/examples.hpp" +#include "nbl/examples/Benchmark/BenchmarkTypes.h" +#include "nbl/asset/utils/CCompilerSet.h" +#include "nbl/asset/utils/IShaderCompiler.h" + +#include +#include +#include +#include +#include +#include +#include + +class GPUBenchmarkHelper +{ +public: + struct InitData + { + nbl::core::smart_refctd_ptr device; + nbl::core::smart_refctd_ptr logger; + nbl::video::IPhysicalDevice* physicalDevice = nullptr; + uint32_t computeFamilyIndex = 0; + nbl::hlsl::uint32_t3 dispatchGroupCount = {0, 0, 0}; + uint64_t samplesPerDispatch = 0; + }; + + // One shader source for a benchmark variant. Picks ONE of two paths: + // * Precompiled: `precompiledKey` is a SPIRV asset key from CMake-time + // NBL_CREATE_NSC_COMPILE_RULES. `defines` is ignored. + // * Runtime: `sourcePath` is an .hlsl file resolved against "app_resources", + // compiled at load time with `defines` as -D macros. Use this for fast + // variant iteration without reconfiguring CMake. + struct ShaderVariant + { + // SMacroDefinition uses string_view; this struct owns the backing strings. + struct Define + { + std::string identifier; + std::string definition; + }; + + std::string sourcePath; + std::string precompiledKey; + std::vector defines; + nbl::asset::IShader::E_SHADER_STAGE stage = nbl::asset::IShader::E_SHADER_STAGE::ESS_COMPUTE; + + static ShaderVariant Precompiled(std::string key) + { + ShaderVariant v; + v.precompiledKey = std::move(key); + return v; + } + static ShaderVariant FromSource(std::string path, std::vector defs = {}, nbl::asset::IShader::E_SHADER_STAGE stage = nbl::asset::IShader::E_SHADER_STAGE::ESS_COMPUTE) + { + ShaderVariant v; + v.sourcePath = std::move(path); + v.defines = std::move(defs); + v.stage = stage; + return v; + } + + bool isRuntime() const { return !sourcePath.empty() && precompiledKey.empty(); } + bool isPrecompiled() const { return !precompiledKey.empty(); } + }; + + // Layout: [bindOnce] [warmup x dispatchOne][ts0][bench x dispatchOne][ts1][cooldown x dispatchOne] + // Cooldown == warmup so the measured window isn't on a winding-down tail. + // Putting binds inside dispatchOne adds per-iteration cmdbuf overhead that + // shows up in ps/sample on tight shaders. + using DispatchFn = std::function; + + // Input choice for createBindings(). Output is always implicit BDA. + enum class InputBuffer : uint8_t + { + None, + BDA, + SSBO, + UBO, + }; + + struct BindingsConfig + { + size_t outputBytes = 0; + size_t pushConstantBytes = 0; + size_t inputBytes = 0; + InputBuffer inputMode = InputBuffer::None; + }; + + struct Bindings + { + nbl::core::smart_refctd_ptr outputBuf; + uint64_t outputAddress = 0; + nbl::core::smart_refctd_ptr pipelineLayout; + + nbl::core::smart_refctd_ptr inputBuf; + uint64_t inputAddress = 0; // BDA mode only + + nbl::core::smart_refctd_ptr dsLayout; + nbl::core::smart_refctd_ptr ds; + }; + + struct PipelineEntry + { + nbl::core::smart_refctd_ptr pipeline; + nbl::core::smart_refctd_ptr layout; + PipelineStats stats; + std::string tag; + }; + + // Common bindOnce body: bind pipeline + upload push constants. Most benches + // have nothing else in bindOnce; the few that bind descriptor sets too call + // cb->bindDescriptorSets() before/after this. + template + static void defaultBindAndPush(nbl::video::IGPUCommandBuffer* cb, const PipelineEntry& pe, const PC& pc) + { + cb->bindComputePipeline(pe.pipeline.get()); + cb->pushConstants(pe.layout.get(), nbl::asset::IShader::E_SHADER_STAGE::ESS_COMPUTE, 0, sizeof(PC), &pc); + } + + // Dispatch using m_dispatchGroupCount (the setup-time shape). + void defaultDispatch(nbl::video::IGPUCommandBuffer* cb) const + { + cb->dispatch(m_dispatchGroupCount.x, m_dispatchGroupCount.y, m_dispatchGroupCount.z); + } + + bool init(const InitData& data) + { + m_device = data.device; + m_logger = data.logger; + m_physicalDevice = data.physicalDevice; + m_queue = m_device->getQueue(data.computeFamilyIndex, 0); + m_dispatchGroupCount = data.dispatchGroupCount; + m_samplesPerDispatch = data.samplesPerDispatch; + + m_cmdpool = m_device->createCommandPool(data.computeFamilyIndex, + nbl::video::IGPUCommandPool::CREATE_FLAGS::RESET_COMMAND_BUFFER_BIT); + if (!m_cmdpool->createCommandBuffers(nbl::video::IGPUCommandPool::BUFFER_LEVEL::PRIMARY, 1u, &m_cmdbuf)) + { + benchLogFmt(m_logger.get(), nbl::system::ILogger::ELL_ERROR, "GPUBenchmarkHelper: failed to create cmdbuf"); + return false; + } + + nbl::video::IQueryPool::SCreationParams qparams = {}; + qparams.queryType = nbl::video::IQueryPool::TYPE::TIMESTAMP; + qparams.queryCount = 2; + qparams.pipelineStatisticsFlags = nbl::video::IQueryPool::PIPELINE_STATISTICS_FLAGS::NONE; + m_queryPool = m_device->createQueryPool(qparams); + if (!m_queryPool) + { + benchLogFmt(m_logger.get(), nbl::system::ILogger::ELL_ERROR, "GPUBenchmarkHelper: failed to create timestamp query pool"); + return false; + } + return true; + } + + // Load (precompiled path) or load+compile (runtime path) a variant's SPIRV. + nbl::core::smart_refctd_ptr loadShader(const ShaderVariant& variant, nbl::core::smart_refctd_ptr assetMgr) const + { + using namespace nbl; + if (!variant.isRuntime() && !variant.isPrecompiled()) + { + benchLogFmt(m_logger.get(), system::ILogger::ELL_ERROR, "GPUBenchmarkHelper::loadShader: variant has neither sourcePath nor precompiledKey"); + return nullptr; + } + + asset::IAssetLoader::SAssetLoadParams lp = {}; + lp.logger = m_logger.get(); + + std::string key; + if (variant.isPrecompiled()) + { + lp.workingDirectory = "app_resources"; + key = variant.precompiledKey; + } + else + { + lp.workingDirectory = ""; + key = "app_resources/" + variant.sourcePath; + } + auto bundle = assetMgr->getAsset(key, lp); + const auto assets = bundle.getContents(); + if (assets.empty()) + { + benchLogFmt(m_logger.get(), system::ILogger::ELL_ERROR, "GPUBenchmarkHelper::loadShader: failed to load '{}'", key); + return nullptr; + } + auto source = asset::IAsset::castDown(assets[0]); + if (!source) + { + benchLogFmt(m_logger.get(), system::ILogger::ELL_ERROR, "GPUBenchmarkHelper::loadShader: '{}' is not an IShader asset", key); + return nullptr; + } + + if (variant.isPrecompiled()) + return source; + + auto* compilerSet = assetMgr->getCompilerSet(); + auto compiler = compilerSet->getShaderCompiler(source->getContentType()); + if (!compiler) + { + benchLogFmt(m_logger.get(), system::ILogger::ELL_ERROR, "GPUBenchmarkHelper::loadShader: no compiler for content type of '{}'", variant.sourcePath); + return nullptr; + } + + std::vector wireDefines; + wireDefines.reserve(variant.defines.size()); + for (const auto& d : variant.defines) + wireDefines.push_back({d.identifier, d.definition}); + + asset::IShaderCompiler::SCompilerOptions options = {}; + options.stage = variant.stage; + options.preprocessorOptions.targetSpirvVersion = m_device->getPhysicalDevice()->getLimits().spirvVersion; + options.preprocessorOptions.sourceIdentifier = source->getFilepathHint(); + options.preprocessorOptions.logger = m_logger.get(); + options.preprocessorOptions.includeFinder = compiler->getDefaultIncludeFinder(); + options.preprocessorOptions.extraDefines = {wireDefines.data(), wireDefines.size()}; + + auto spirv = compilerSet->compileToSPIRV(source.get(), options); + if (!spirv) + benchLogFmt(m_logger.get(), system::ILogger::ELL_ERROR, "GPUBenchmarkHelper::loadShader: runtime compile failed for '{}'", variant.sourcePath); + return spirv; + } + + nbl::core::smart_refctd_ptr allocateDeviceLocalBuffer(nbl::video::IGPUBuffer::SCreationParams bp, const char* label, + nbl::video::IDeviceMemoryAllocation::E_MEMORY_ALLOCATE_FLAGS allocFlags = nbl::video::IDeviceMemoryAllocation::EMAF_NONE) + { + auto buf = m_device->createBuffer(std::move(bp)); + auto reqs = buf->getMemoryReqs(); + reqs.memoryTypeBits &= m_physicalDevice->getDeviceLocalMemoryTypeBits(); + auto alloc = m_device->allocate(reqs, buf.get(), allocFlags); + if (!alloc.isValid()) + benchLogFmt(m_logger.get(), nbl::system::ILogger::ELL_ERROR, "GPUBenchmarkHelper: failed to allocate {}", label); + return buf; + } + + struct SingleBindingDS + { + nbl::core::smart_refctd_ptr layout; + nbl::core::smart_refctd_ptr set; + }; + + SingleBindingDS createSingleBindingDS( + nbl::core::smart_refctd_ptr buffer, + nbl::asset::IDescriptor::E_TYPE type = nbl::asset::IDescriptor::E_TYPE::ET_STORAGE_BUFFER, + uint32_t binding = 0, + nbl::hlsl::ShaderStage stages = nbl::hlsl::ShaderStage::ESS_COMPUTE) + { + using namespace nbl; + const size_t bufferBytes = buffer->getSize(); + + video::IGPUDescriptorSetLayout::SBinding b = { + .binding = binding, + .type = type, + .createFlags = video::IGPUDescriptorSetLayout::SBinding::E_CREATE_FLAGS::ECF_NONE, + .stageFlags = stages, + .count = 1, + }; + SingleBindingDS out; + out.layout = m_device->createDescriptorSetLayout({&b, 1}); + auto pool = m_device->createDescriptorPoolForDSLayouts(video::IDescriptorPool::ECF_NONE, {&out.layout.get(), 1}); + out.set = pool->createDescriptorSet(core::smart_refctd_ptr(out.layout)); + + video::IGPUDescriptorSet::SDescriptorInfo info = {}; + info.desc = std::move(buffer); + info.info.buffer = {.offset = 0, .size = bufferBytes}; + video::IGPUDescriptorSet::SWriteDescriptorSet w = { + .dstSet = out.set.get(), + .binding = binding, + .arrayElement = 0, + .count = 1, + .info = &info, + }; + m_device->updateDescriptorSets({&w, 1}, {}); + return out; + } + + nbl::core::smart_refctd_ptr createOutputBuffer( + size_t bytes, + nbl::core::bitflag extraUsage = nbl::video::IGPUBuffer::E_USAGE_FLAGS::EUF_NONE, + nbl::video::IDeviceMemoryAllocation::E_MEMORY_ALLOCATE_FLAGS allocFlags = nbl::video::IDeviceMemoryAllocation::EMAF_NONE) + { + nbl::video::IGPUBuffer::SCreationParams bp = {}; + bp.size = bytes; + bp.usage = nbl::core::bitflag(nbl::video::IGPUBuffer::EUF_STORAGE_BUFFER_BIT) | extraUsage; + return allocateDeviceLocalBuffer(std::move(bp), "output buffer", allocFlags); + } + + // Buffer must have been created with EUF_TRANSFER_DST_BIT. + void submitFillZero(nbl::core::smart_refctd_ptr buf, size_t bytes) const + { + nbl::core::smart_refctd_ptr initCmdbuf; + m_cmdpool->createCommandBuffers(nbl::video::IGPUCommandPool::BUFFER_LEVEL::PRIMARY, 1u, &initCmdbuf); + initCmdbuf->begin(nbl::video::IGPUCommandBuffer::USAGE::ONE_TIME_SUBMIT_BIT); + const nbl::asset::SBufferRange range = {.offset = 0, .size = bytes, .buffer = std::move(buf)}; + initCmdbuf->fillBuffer(range, 0u); + initCmdbuf->end(); + + const nbl::video::IQueue::SSubmitInfo::SCommandBufferInfo cmds[] = {{.cmdbuf = initCmdbuf.get()}}; + nbl::video::IQueue::SSubmitInfo submit = {}; + submit.commandBuffers = cmds; + m_queue->submit({&submit, 1u}); + m_device->waitIdle(); + } + + nbl::core::smart_refctd_ptr createInputBufferZeroFilled(size_t bytes) + { + auto buf = createOutputBuffer(bytes, nbl::video::IGPUBuffer::EUF_TRANSFER_DST_BIT); + if (buf) + submitFillZero(buf, bytes); + return buf; + } + + // BDA buffer staged into device-local VRAM via IUtilities. + nbl::core::smart_refctd_ptr createBdaBuffer(const void* srcData, size_t bytes) + { + using namespace nbl; + if (!m_utils) + m_utils = video::IUtilities::create(core::smart_refctd_ptr(m_device), core::smart_refctd_ptr(m_logger)); + + video::IGPUBuffer::SCreationParams bp = {}; + bp.size = bytes; + bp.usage = core::bitflag(video::IGPUBuffer::EUF_STORAGE_BUFFER_BIT) | video::IGPUBuffer::EUF_SHADER_DEVICE_ADDRESS_BIT | video::IGPUBuffer::EUF_TRANSFER_DST_BIT; + core::smart_refctd_ptr buf; + auto future = m_utils->createFilledDeviceLocalBufferOnDedMem( + video::SIntendedSubmitInfo {.queue = m_queue}, std::move(bp), srcData); + future.move_into(buf); + return buf; + } + + uint32_t createPipeline(const ShaderVariant& variant, + nbl::core::smart_refctd_ptr assetMgr, + size_t pushConstantSize, + std::string tag = "", + nbl::core::smart_refctd_ptr dsLayout = nullptr) + { + using namespace nbl; + const uint32_t idx = uint32_t(m_pipelines.size()); + m_pipelines.push_back({.tag = tag}); + PipelineEntry& slot = m_pipelines.back(); + + const asset::SPushConstantRange pcRange = { + .stageFlags = asset::IShader::E_SHADER_STAGE::ESS_COMPUTE, + .offset = 0, + .size = uint32_t(pushConstantSize), + }; + auto layout = dsLayout + ? m_device->createPipelineLayout({&pcRange, 1}, core::smart_refctd_ptr(dsLayout)) + : m_device->createPipelineLayout({&pcRange, 1}); + if (!layout) + { + benchLogFmt(m_logger.get(), system::ILogger::ELL_ERROR, "createPipeline({}): pipeline layout creation failed", tag); + return idx; + } + + auto source = loadShader(variant, std::move(assetMgr)); + auto shader = source ? m_device->compileShader({.source = source.get()}) : nullptr; + if (!shader) + { + benchLogFmt(m_logger.get(), system::ILogger::ELL_ERROR, "createPipeline({}): shader load/compile failed", tag); + return idx; + } + + video::IGPUComputePipeline::SCreationParams pp = {}; + pp.layout = layout.get(); + pp.shader.shader = shader.get(); + pp.shader.entryPoint = "main"; + if (m_device->getEnabledFeatures().pipelineExecutableInfo) + pp.flags |= video::IGPUComputePipeline::SCreationParams::FLAGS::CAPTURE_STATISTICS | video::IGPUComputePipeline::SCreationParams::FLAGS::CAPTURE_INTERNAL_REPRESENTATIONS; + + core::smart_refctd_ptr pipeline; + if (!m_device->createComputePipelines(nullptr, {&pp, 1}, &pipeline) || !pipeline) + { + benchLogFmt(m_logger.get(), system::ILogger::ELL_ERROR, "createPipeline({}): createComputePipelines failed", tag); + return idx; + } + + if (m_device->getEnabledFeatures().pipelineExecutableInfo) + { + auto infos = pipeline->getExecutableInfo(); + slot.stats.raw = nbl::system::to_string(infos); + + uint64_t vgpr = 0, sgpr = 0; + for (const auto& info : infos) + { + if (info.subgroupSize) + slot.stats.subgroupSize = std::max(slot.stats.subgroupSize, info.subgroupSize); + for (const auto& stat : info.structuredStatistics) + matchStat(stat, slot.stats, vgpr, sgpr); + } + // AMD-style drivers expose VGPR/SGPR separately without a combined + // register count, so fall back to the sum. + if (slot.stats.registerCount == 0 && (vgpr || sgpr)) + slot.stats.registerCount = vgpr + sgpr; + + if (!slot.stats.raw.empty()) + benchLogFmt(m_logger.get(), system::ILogger::ELL_PERFORMANCE, "{} pipeline executable report:\n{}", tag, slot.stats.raw); + } + + slot.layout = std::move(layout); + slot.pipeline = std::move(pipeline); + return idx; + } + + Bindings createBindings(const BindingsConfig& cfg) + { + using namespace nbl; + Bindings out; + + out.outputBuf = createOutputBuffer(cfg.outputBytes, video::IGPUBuffer::EUF_SHADER_DEVICE_ADDRESS_BIT, video::IDeviceMemoryAllocation::EMAF_DEVICE_ADDRESS_BIT); + out.outputAddress = out.outputBuf->getDeviceAddress(); + + if (cfg.inputMode != InputBuffer::None && cfg.inputBytes > 0) + { + const bool useBDA = cfg.inputMode == InputBuffer::BDA; + const bool useUBO = cfg.inputMode == InputBuffer::UBO; + const bool useSSBO = cfg.inputMode == InputBuffer::SSBO; + + video::IGPUBuffer::SCreationParams bp = {}; + bp.size = cfg.inputBytes; + bp.usage = core::bitflag(video::IGPUBuffer::EUF_TRANSFER_DST_BIT); + if (useBDA || useSSBO) + bp.usage |= video::IGPUBuffer::EUF_STORAGE_BUFFER_BIT; + if (useBDA) + bp.usage |= video::IGPUBuffer::EUF_SHADER_DEVICE_ADDRESS_BIT; + if (useUBO) + bp.usage |= video::IGPUBuffer::EUF_UNIFORM_BUFFER_BIT; + + out.inputBuf = allocateDeviceLocalBuffer(std::move(bp), "input buffer", + useBDA ? video::IDeviceMemoryAllocation::EMAF_DEVICE_ADDRESS_BIT : video::IDeviceMemoryAllocation::EMAF_NONE); + + if (useBDA) + out.inputAddress = out.inputBuf->getDeviceAddress(); + + submitFillZero(out.inputBuf, cfg.inputBytes); + + if (useSSBO || useUBO) + { + video::IGPUDescriptorSetLayout::SBinding b = { + .binding = 0, + .type = useSSBO ? asset::IDescriptor::E_TYPE::ET_STORAGE_BUFFER : asset::IDescriptor::E_TYPE::ET_UNIFORM_BUFFER, + .createFlags = video::IGPUDescriptorSetLayout::SBinding::E_CREATE_FLAGS::ECF_NONE, + .stageFlags = nbl::hlsl::ShaderStage::ESS_COMPUTE, + .count = 1, + }; + out.dsLayout = m_device->createDescriptorSetLayout({&b, 1}); + + auto pool = m_device->createDescriptorPoolForDSLayouts(video::IDescriptorPool::ECF_NONE, {&out.dsLayout.get(), 1}); + out.ds = pool->createDescriptorSet(core::smart_refctd_ptr(out.dsLayout)); + + video::IGPUDescriptorSet::SDescriptorInfo info = {}; + info.desc = core::smart_refctd_ptr(out.inputBuf); + info.info.buffer = {.offset = 0, .size = cfg.inputBytes}; + video::IGPUDescriptorSet::SWriteDescriptorSet w = { + .dstSet = out.ds.get(), + .binding = 0, + .arrayElement = 0, + .count = 1, + .info = &info, + }; + m_device->updateDescriptorSets({&w, 1}, {}); + } + } + + { + const asset::SPushConstantRange pc = { + .stageFlags = nbl::hlsl::ShaderStage::ESS_COMPUTE, + .offset = 0, + .size = uint32_t(cfg.pushConstantBytes), + }; + std::span pcRange = cfg.pushConstantBytes > 0 ? std::span(&pc, 1) : std::span {}; + + if (out.dsLayout) + out.pipelineLayout = m_device->createPipelineLayout(pcRange, core::smart_refctd_ptr(out.dsLayout)); + else + out.pipelineLayout = m_device->createPipelineLayout(pcRange); + } + + return out; + } + + struct BdaBuffer + { + nbl::core::smart_refctd_ptr buf; + uint64_t address = 0; + }; + + BdaBuffer createBdaOutputBuffer(size_t bytes) + { + BdaBuffer out; + out.buf = createOutputBuffer(bytes, nbl::video::IGPUBuffer::EUF_SHADER_DEVICE_ADDRESS_BIT, nbl::video::IDeviceMemoryAllocation::EMAF_DEVICE_ADDRESS_BIT); + out.address = out.buf ? out.buf->getDeviceAddress() : 0; + return out; + } + + // Auto-sizes the dispatch count so the measured window covers ~targetBudgetMs + // of GPU work. Pilots with a small N, then either scales to the budget or + // doubles when the pilot is too noisy (sub-millisecond) to extrapolate. + // + // `samples` controls jitter robustness: values >1 take K independent + // budget-sized timing windows and return the MEDIAN window, costing ~K * + // targetBudgetMs of wall time. Median (not min) is used because GPU + // measurement noise can be two-sided in practice. + TimingResult runTimedBudgeted(uint32_t warmupDispatches, uint64_t targetBudgetMs, const DispatchFn& bindOnce, const DispatchFn& dispatchOne, uint32_t samples) + { + const uint64_t targetBudgetNs = targetBudgetMs * 1'000'000ull; + constexpr uint32_t kPilotN = 64; + constexpr uint32_t kMaxN = 1u << 24; // safety cap for ultra-fast shaders + TimingResult r = runTimed(warmupDispatches, kPilotN, bindOnce, dispatchOne); + uint32_t lastN = kPilotN; + while (r.elapsed_ns < targetBudgetNs && lastN < kMaxN) + { + uint32_t nextN; + if (r.elapsed_ns > 1'000'000ull) // > 1 ms, stable enough to scale + { + const double scale = double(targetBudgetNs) / double(r.elapsed_ns); + nextN = uint32_t(std::min(double(kMaxN), std::ceil(double(lastN) * scale))); + } + else + { + nextN = std::min(kMaxN, lastN * 2); + } + if (nextN <= lastN) + break; // converged + r = runTimed(warmupDispatches, nextN, bindOnce, dispatchOne); + lastN = nextN; + } + + if (samples <= 1) + return r; + + // Reuse the convergence's final measurement as one of the K samples + // (it's already a budget-sized window at lastN). Run K-1 more at the + // same N. All windows measure the same dispatch count, so the per-window + // elapsed_ns values are directly comparable. + std::vector ns; + ns.reserve(samples); + ns.push_back(r.elapsed_ns); + for (uint32_t i = 1; i < samples; ++i) + { + const TimingResult ri = runTimed(warmupDispatches, lastN, bindOnce, dispatchOne); + ns.push_back(ri.elapsed_ns); + } + std::sort(ns.begin(), ns.end()); + + // Outlier rejection: GPU jitter is usually a one-sided spike + const double median = ns[ns.size() / 2]; + const double dLow = median - ns.front(); + const double dHigh = ns.back() - median; + const double dCloser = std::min(dLow, dHigh); + const double dFar = std::max(dLow, dHigh); + size_t lo = 0; + size_t hi = ns.size(); + if (dCloser > 0.0 && dFar > 2.0 * dCloser) + { + if (dHigh > dLow) + --hi; // top sample is the spike + else + ++lo; // bottom sample is the spike (rare on GPU but cheap to handle) + } + + double sum = 0.0; + for (size_t i = lo; i < hi; ++i) + sum += ns[i]; + const double resultNs = sum / double(hi - lo); + + TimingResult m {}; + m.elapsed_ns = resultNs; + m.totalSamples = uint64_t(lastN) * m_samplesPerDispatch; + m.ps_per_sample = m.totalSamples ? resultNs * 1e3 / double(m.totalSamples) : 0.0; + m.gsamples_per_s = resultNs > 0.0 ? double(m.totalSamples) / resultNs : 0.0; + m.ms_total = resultNs * 1e-6; + return m; + } + + TimingResult runTimed(uint32_t warmupDispatches, uint32_t benchDispatches, const DispatchFn& bindOnce, const DispatchFn& dispatchOne) + { + m_device->waitIdle(); + const uint32_t cooldownDispatches = warmupDispatches; + + m_cmdbuf->reset(nbl::video::IGPUCommandBuffer::RESET_FLAGS::NONE); + m_cmdbuf->begin(nbl::video::IGPUCommandBuffer::USAGE::ONE_TIME_SUBMIT_BIT); + m_cmdbuf->resetQueryPool(m_queryPool.get(), 0, 2); + + if (bindOnce) + bindOnce(m_cmdbuf.get()); + + for (uint32_t i = 0u; i < warmupDispatches; ++i) + dispatchOne(m_cmdbuf.get()); + + m_cmdbuf->writeTimestamp(nbl::asset::PIPELINE_STAGE_FLAGS::COMPUTE_SHADER_BIT, m_queryPool.get(), 0); + for (uint32_t i = 0u; i < benchDispatches; ++i) + dispatchOne(m_cmdbuf.get()); + m_cmdbuf->writeTimestamp(nbl::asset::PIPELINE_STAGE_FLAGS::COMPUTE_SHADER_BIT, m_queryPool.get(), 1); + + for (uint32_t i = 0u; i < cooldownDispatches; ++i) + dispatchOne(m_cmdbuf.get()); + m_cmdbuf->end(); + + auto semaphore = m_device->createSemaphore(0u); + const nbl::video::IQueue::SSubmitInfo::SCommandBufferInfo benchCmds[] = {{.cmdbuf = m_cmdbuf.get()}}; + const nbl::video::IQueue::SSubmitInfo::SSemaphoreInfo signalSem[] = { + {.semaphore = semaphore.get(), .value = 1u, .stageMask = nbl::asset::PIPELINE_STAGE_FLAGS::COMPUTE_SHADER_BIT}}; + nbl::video::IQueue::SSubmitInfo submit = {}; + submit.commandBuffers = benchCmds; + submit.signalSemaphores = signalSem; + m_queue->submit({&submit, 1u}); + + m_device->waitIdle(); + + uint64_t timestamps[2] = {}; + const auto flags = nbl::core::bitflag(nbl::video::IQueryPool::RESULTS_FLAGS::_64_BIT) | nbl::core::bitflag(nbl::video::IQueryPool::RESULTS_FLAGS::WAIT_BIT); + m_device->getQueryPoolResults(m_queryPool.get(), 0, 2, timestamps, sizeof(uint64_t), flags); + + TimingResult r {}; + const double timestampPeriod = double(m_physicalDevice->getLimits().timestampPeriodInNanoSeconds); + r.elapsed_ns = double(timestamps[1] - timestamps[0]) * timestampPeriod; + r.totalSamples = uint64_t(benchDispatches) * m_samplesPerDispatch; + r.ps_per_sample = r.totalSamples ? r.elapsed_ns * 1e3 / double(r.totalSamples) : 0.0; + r.gsamples_per_s = r.elapsed_ns > 0.0 ? double(r.totalSamples) / r.elapsed_ns : 0.0; + r.ms_total = r.elapsed_ns * 1e-6; + return r; + } + +protected: + std::vector m_pipelines; + +private: + static void matchStat(const nbl::video::IGPUPipelineBase::SExecutableStatistic& stat, PipelineStats& out, uint64_t& vgpr, uint64_t& sgpr) + { + const uint64_t v = stat.asUint(); + + auto contains = [&](std::string_view kw) + { + const auto it = std::ranges::search(stat.name, kw, + [&](char a, char b) + { return std::tolower(a) == std::tolower(b); }) + .begin(); + return it != stat.name.end(); + }; + + // Order matters: more specific keys first. + + if (contains("subgroup size") || contains("subgroupsize") || contains("warp size") || contains("wave size")) + out.subgroupSize = std::max(out.subgroupSize, uint32_t(v)); + + else if (contains("vgpr")) + vgpr = std::max(vgpr, v); + else if (contains("sgpr")) + sgpr = std::max(sgpr, v); + else if (contains("register")) + out.registerCount = std::max(out.registerCount, v); + + else if (contains("binary size") || contains("binarysize") || contains("codesize") || contains("code size") || contains("isa size")) + out.codeSizeBytes = std::max(out.codeSizeBytes, v); + else if (contains("instructioncount") || contains("instruction count") || contains("numinstructions")) + out.codeSizeBytes = std::max(out.codeSizeBytes, v); // proxy when no byte size + + else if (contains("shared memory") || contains("sharedmemory") || contains("groupshared") || contains("lds")) + out.sharedMemBytes = std::max(out.sharedMemBytes, v); + + else if (contains("stack size") || contains("stacksize")) + out.stackBytes = std::max(out.stackBytes, v); + + else if (contains("local memory") || contains("localmemory") || contains("scratch") || contains("private memory") || contains("privatememory") || contains("stack")) + out.privateMemBytes = std::max(out.privateMemBytes, v); + + // Vendor-specific stats + // get a structured copy so JSON round-trips the right numeric type. + else + out.unknowns.push_back(stat); + } + + nbl::core::smart_refctd_ptr m_device; + nbl::core::smart_refctd_ptr m_logger; + nbl::video::IPhysicalDevice* m_physicalDevice = nullptr; + nbl::video::IQueue* m_queue = nullptr; + nbl::hlsl::uint32_t3 m_dispatchGroupCount {}; + uint64_t m_samplesPerDispatch = 0; + nbl::core::smart_refctd_ptr m_cmdpool; + nbl::core::smart_refctd_ptr m_cmdbuf; + nbl::core::smart_refctd_ptr m_queryPool; + nbl::core::smart_refctd_ptr m_utils; // lazy, only built on first createBdaBuffer call +}; + +#endif diff --git a/common/include/nbl/examples/Benchmark/IBenchmark.h b/common/include/nbl/examples/Benchmark/IBenchmark.h new file mode 100644 index 000000000..93493c2c6 --- /dev/null +++ b/common/include/nbl/examples/Benchmark/IBenchmark.h @@ -0,0 +1,409 @@ +// Copyright (C) 2018-2024 - DevSH Graphics Programming Sp. z O.O. +// This file is part of the "Nabla Engine". +// For conditions of distribution and use, see copyright notice in nabla.h + +#ifndef _NBL_COMMON_I_BENCHMARK_INCLUDED_ +#define _NBL_COMMON_I_BENCHMARK_INCLUDED_ + +#include +#include "nbl/examples/Benchmark/BenchmarkTypes.h" +#include "nbl/examples/Benchmark/BenchmarkConsole.h" +#include "nbl/examples/Benchmark/GPUBenchmarkHelper.h" +#include "nbl/examples/Benchmark/BenchmarkJson.h" +#include "nbl/examples/Benchmark/BenchmarkCli.h" +#include "nlohmann/json.hpp" + +#include +#include +#include +#include +#include +#include +#include +#include + + +struct RunContext +{ + WorkloadShape shape; + uint64_t targetBudgetMs = 400; // wall-clock budget per row + std::string sectionLabel = "Benchmarks"; +}; + +// Typical use: +// +// Aggregator agg(logger, logicalDevice, physicalDevice, computeFamilyIndex); +// agg.applyCli({.argv = argv, .defaultOutputPath = "Bench.json"}); +// const RunContext myCtx{.shape = ..., .targetBudgetMs = 400, .sectionLabel = "..."}; +// std::vector benches; +// for (...) benches.emplace_back(agg, MyBench::SetupData{...}); +// MyOtherBench other(agg, MyOtherBench::SetupData{...}); +// agg.runSessionAndReport( +// Aggregator::Span{std::span(benches), myCtx}, +// Aggregator::Span{std::span(&other, 1), otherCtx}); +class Aggregator +{ + friend class IBenchmark; + +public: + Aggregator() = default; + + Aggregator(nbl::core::smart_refctd_ptr logger, + nbl::core::smart_refctd_ptr logicalDevice, + nbl::video::IPhysicalDevice* physicalDevice, + uint32_t computeFamilyIndex) + { + m_console.setLogger(std::move(logger)); + m_logicalDevice = std::move(logicalDevice); + m_physicalDevicePtr = physicalDevice; + m_computeFamilyIndex = computeFamilyIndex; + setDevice(physicalDevice); + } + + void setSilent(bool silent) { m_console.setSilent(silent); } + + const nbl::core::smart_refctd_ptr& getLogicalDevice() const { return m_logicalDevice; } + nbl::video::IPhysicalDevice* getPhysicalDevice() const { return m_physicalDevicePtr; } + uint32_t getComputeFamilyIndex() const { return m_computeFamilyIndex; } + nbl::core::smart_refctd_ptr getLogger() const + { + return nbl::core::smart_refctd_ptr(m_console.getLogger()); + } + + bool loadBaseline(std::string label, const std::string& path) + { + auto b = benchmark_json::loadBaselineFile(label, path); + if (!b) + return false; + + for (const auto& [_, row] : b->rowsByName) + m_console.growForBaseline(row); + + // Vector (not map) so delta columns print in load order. + auto it = std::find_if(m_baselines.begin(), m_baselines.end(), + [&](const Baseline& existing) { return existing.label == label; }); + if (it != m_baselines.end()) + *it = std::move(*b); + else + m_baselines.push_back(std::move(*b)); + return true; + } + + bool loadBaseline(const std::string& path) { return loadBaseline("baseline", path); } + + bool writeReport(const std::string& path) + { + size_t preservedCount = 0; + if (!benchmark_json::writeReportFile(path, m_device, m_baselines, m_results, m_console.getLogger(), &preservedCount)) + return false; + + if (preservedCount > 0) + benchLogFmt(m_console.getLogger(), nbl::system::ILogger::ELL_INFO, + "Wrote benchmark report to {} ({} new + {} preserved from prior file)", + path, m_results.size(), preservedCount); + else + benchLogFmt(m_console.getLogger(), nbl::system::ILogger::ELL_INFO, + "Wrote benchmark report to {} ({} rows)", path, m_results.size()); + return true; + } + + // Captured for the UUID-mismatch warning in applyCli. + void setDevice(const nbl::video::IPhysicalDevice* dev) { m_device = benchmark_json::buildDeviceMetadata(dev); } + + struct CliResult + { + std::string outputPath; + nbl::core::vector> focusVariants; + uint32_t focusSamples = 3; // --focus-samples, see samplesForCurrentRow + + bool isFocused(const nbl::core::vector& name) const + { + return std::ranges::find(focusVariants, name) != focusVariants.end(); + } + }; + + template + struct Span + { + std::span benches; + RunContext context; + }; + + // Two overloads so a single bench doesn't need `std::span(&bench, 1)`. + template + requires requires (Range& r) { std::data(r); std::size(r); } + static auto makeSpan(Range& benches, RunContext context) + { + using T = std::remove_reference_t; + return Span{std::span(std::data(benches), std::size(benches)), std::move(context)}; + } + + template + requires std::derived_from + static Span makeSpan(T& bench, RunContext context) + { + return Span{std::span(&bench, 1), std::move(context)}; + } + + static std::string describe(const RunContext& ctx) + { + const auto& sh = ctx.shape; + const uint32_t wgThreads = sh.workgroupSize.x * sh.workgroupSize.y * sh.workgroupSize.z; + const uint32_t threadsPerDisp = sh.dispatchGroupCount.x * sh.dispatchGroupCount.y * sh.dispatchGroupCount.z * wgThreads; + const uint64_t itersPerThread = threadsPerDisp ? sh.samplesPerDispatch / threadsPerDisp : 0; + const double budgetMs = double(ctx.targetBudgetMs); + return std::format("=== {} (~{:.0f}ms/row, {} threads/dispatch, {} iters/thread; wg={}x{}x{}; ps/sample is per all GPU threads) ===", + ctx.sectionLabel, budgetMs, threadsPerDisp, itersPerThread, sh.workgroupSize.x, sh.workgroupSize.y, sh.workgroupSize.z); + } + + // Order: banner -> focus(spans...) -> comparison table -> banner -> + // column header -> rest(spans...) -> writeReport. + // All focus rows print globally first, then all rest rows; banner printed + // twice so each chunk reads in isolation when scrolling back. + template + requires(std::derived_from && ...) + void runSessionAndReport(Span... spans) + { + // Templated lambda (not `auto& s`) so only Span deduces -- a future + // signature change can't silently start passing arbitrary types through. + auto runSpan = [this](Span& s, bool silent) + { + if (s.benches.empty()) + return; + if (!silent) + { + m_console.logSectionBanner(describe(s.context)); + m_console.logHeader(m_baselines); + } + for (auto& e : s.benches) + e.run(); + // Flush after each rest span: if span N+1 dies mid-way, span N's + // rows are already on disk. Trailing flush is also the final write. + if (!silent) + writeReport(m_cli.outputPath); + }; + + m_console.logBannerNotes(m_baselines); + if (!m_cli.focusVariants.empty()) + { + m_console.setSilent(true); // benches read this to know they're in the focused-rows half + (runSpan(spans, true), ...); + m_console.setSilent(false); + m_console.printBaselineComparison(std::span>(m_focusNames), m_baselines, m_results); + } + (runSpan(spans, false), ...); + } + + struct CliConfig + { + std::span argv; // feed from IApplicationFramework::argv + std::string defaultOutputPath = "Bench.json"; + std::string appName = "benchmark"; + }; + + CliResult applyCli(const CliConfig& cfg) + { + auto parsed = benchmark_cli::parseArgs(cfg.argv, cfg.defaultOutputPath); + if (parsed.helpRequested) + { + benchmark_cli::printHelp(m_console.getLogger(), cfg.appName, cfg.defaultOutputPath); + exit(0); + } + if (parsed.noColor) + m_console.setColorEnabled(false); + + CliResult res; + res.outputPath = parsed.outputPath; + + if (!parsed.baselines.empty()) + { + size_t succeeded = 0; + for (const auto& [label, path] : parsed.baselines) + { + if (loadBaseline(label, path)) + { + ++succeeded; + benchLogFmt(m_console.getLogger(), nbl::system::ILogger::ELL_INFO, + "Loaded baseline '{}' from {} ({} rows)", label, path, m_baselines.back().rowsByName.size()); + } + else + benchLogFmt(m_console.getLogger(), nbl::system::ILogger::ELL_WARNING, + "Failed to load baseline '{}' from {}, skipped", label, path); + } + if (succeeded == 0) + benchLogFmt(m_console.getLogger(), nbl::system::ILogger::ELL_WARNING, + "All {} --baseline load(s) failed. delta columns and --focus will be empty. " + "Check the paths above; default auto-load of '{}' is suppressed once any --baseline is specified, " + "drop the --baseline flag(s) or use --no-baseline to silence this warning.", + parsed.baselines.size(), res.outputPath); + else if (succeeded < parsed.baselines.size()) + benchLogFmt(m_console.getLogger(), nbl::system::ILogger::ELL_WARNING, + "{} of {} --baseline load(s) failed; continuing with {} loaded.", + parsed.baselines.size() - succeeded, parsed.baselines.size(), succeeded); + } + else if (!parsed.noBaseline) + { + if (loadBaseline(res.outputPath)) + benchLogFmt(m_console.getLogger(), nbl::system::ILogger::ELL_INFO, + "Loaded baseline from {} ({} rows)", res.outputPath, + m_baselines.empty() ? size_t {0} : m_baselines.back().rowsByName.size()); + else + benchLogFmt(m_console.getLogger(), nbl::system::ILogger::ELL_INFO, + "No baseline at {}, delta column will read 'n/a'", res.outputPath); + } + + warnDeviceMismatch(); + + res.focusVariants = std::move(parsed.focus); + res.focusSamples = parsed.focusSamples; + m_cli = res; + return res; + } + +private: + void warnDeviceMismatch() const + { + if (!m_device.is_object() || !m_device.contains("deviceUUID")) + return; + const auto& currentUUID = m_device["deviceUUID"]; + for (const auto& b : m_baselines) + { + if (!b.device.is_object() || !b.device.contains("deviceUUID")) + continue; + if (b.device["deviceUUID"] == currentUUID) + continue; + const std::string baselineDevName = b.device.value("name", std::string {""}); + const std::string currentDevName = m_device.value("name", std::string {""}); + benchLogFmt(m_console.getLogger(), nbl::system::ILogger::ELL_WARNING, + "Baseline '{}' (from {}) was measured on a different GPU ('{}' vs current '{}'). " + "Delta values will be apples-to-oranges.", + b.label, b.path, baselineDevName, currentDevName); + } + } + + // In focus phase (silent), captures the row's name into m_focusNames so + // runSessionAndReport can build the comparison table without main.cpp + // threading names back through each bench class. + void appendAndLog(Result&& r) + { + const std::string joined = joinName(r.name); + if (!m_baselines.empty()) + { + const std::string key = makeKey(r.name); + for (const auto& b : m_baselines) + { + auto it = b.rowsByName.find(key); + if (it == b.rowsByName.end()) + continue; + const bool shapeMismatch = r.workload.present() && it->second.workload.present() && (r.workload.shape != it->second.workload.shape); + r.baselines[b.label] = {it->second.psPerSample, shapeMismatch}; + } + } + m_console.growWidthFor(joined); + if (m_console.silent()) + m_focusNames.push_back(r.name); + m_results.push_back(std::move(r)); + m_console.logRow(std::span(m_results.back().name), joined, m_results.back().timing, m_results.back().stats, m_results.back().baselines, m_baselines); + } + + std::vector m_results; + std::vector m_baselines; + nbl::core::vector> m_focusNames; + nlohmann::json m_device; + CliResult m_cli; + BenchmarkConsole m_console; + nbl::core::smart_refctd_ptr m_logicalDevice; + nbl::video::IPhysicalDevice* m_physicalDevicePtr = nullptr; + uint32_t m_computeFamilyIndex = 0; +}; + +class IBenchmark +{ +public: + virtual ~IBenchmark() = default; + + // Single-named benches override doRun() and inherit this default filter. + // Sweep-style benches synthesize per-row names; they override run() and + // do per-row filtering themselves. + virtual void run() + { + const bool silent = isFocusPhase(); + const bool inFocus = isFocused(m_name); + const bool shouldRun = silent ? inFocus : !inFocus; + if (shouldRun) + doRun(); + } + + uint32_t getWarmupDispatches() const { return m_warmupDispatches; } + uint64_t getTargetBudgetMs() const { return m_targetBudgetMs; } + const WorkloadShape& getShape() const { return m_workloadShape; } + + // Pass this to runTimedBudgeted so only --focus rows pay the K * budget cost. + uint32_t samplesForCurrentRow() const { return isFocusPhase() ? m_aggregator.m_cli.focusSamples : 1u; } + +protected: + // Banner label is NOT taken here; it belongs to the span (see Aggregator::Span). + IBenchmark(Aggregator& aggregator, core::vector name, uint32_t warmupDispatches, const WorkloadShape& shape, uint64_t targetBudgetMs) + : m_name(std::move(name)) + , m_aggregator(aggregator) + , m_warmupDispatches(warmupDispatches) + , m_targetBudgetMs(targetBudgetMs) + , m_workloadShape(shape) + { + registerVariant(m_name); + } + + virtual void doRun() {} + + bool isFocusPhase() const { return m_aggregator.m_console.silent(); } + bool isFocused(const core::vector& name) const { return m_aggregator.m_cli.isFocused(name); } + void registerVariant(std::span name) { m_aggregator.m_console.registerVariant(name); } + void registerVariant(std::initializer_list name) { m_aggregator.m_console.registerVariant(name); } + + void record(core::vector name, const TimingResult& t, const PipelineStats& s) + { + Workload w{.shape = m_workloadShape}; + w.benchDispatches = w.shape.samplesPerDispatch ? uint32_t(t.totalSamples / w.shape.samplesPerDispatch) : 0; + + Result r; + r.name = std::move(name); + r.timing = t; + r.stats = s; + r.workload = w; + m_aggregator.appendAndLog(std::move(r)); + } + + core::vector m_name; + Aggregator& m_aggregator; // non-owning, outlives this bench + uint32_t m_warmupDispatches; + uint64_t m_targetBudgetMs; + WorkloadShape m_workloadShape; +}; + +class GPUBenchmark : public IBenchmark, public GPUBenchmarkHelper +{ +public: + struct SetupData + { + core::vector name; + uint32_t warmupDispatches = 0; + WorkloadShape shape = {}; + uint64_t targetBudgetMs = 400; + }; + +protected: + GPUBenchmark(Aggregator& aggregator, const SetupData& data) + : IBenchmark(aggregator, data.name, data.warmupDispatches, data.shape, data.targetBudgetMs) + { + GPUBenchmarkHelper::init({ + .device = aggregator.getLogicalDevice(), + .logger = aggregator.getLogger(), + .physicalDevice = aggregator.getPhysicalDevice(), + .computeFamilyIndex = aggregator.getComputeFamilyIndex(), + .dispatchGroupCount = data.shape.dispatchGroupCount, + .samplesPerDispatch = data.shape.samplesPerDispatch, + }); + } +}; + +#endif From 429ed1965412d41027d9fa85ad31573887098a3d Mon Sep 17 00:00:00 2001 From: Karim Mohamed Date: Wed, 13 May 2026 11:04:03 +0300 Subject: [PATCH 2/4] fixed a typo --- 64_EmulatedFloatTest/main.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/64_EmulatedFloatTest/main.cpp b/64_EmulatedFloatTest/main.cpp index 352e4e61f..11a6d2f55 100644 --- a/64_EmulatedFloatTest/main.cpp +++ b/64_EmulatedFloatTest/main.cpp @@ -99,9 +99,9 @@ class CEF64Benchmark : public GPUBenchmark [&](IGPUCommandBuffer* cb) { cb->bindDescriptorSets(EPBP_COMPUTE, pe.layout.get(), 0, 1, &m_ds.get()); - bindAndPush(cb, pe, pc); + defaultBindAndPush(cb, pe, pc); }, - [this](IGPUCommandBuffer* cb) { dispatch(cb); }, + [this](IGPUCommandBuffer* cb) { defaultDispatch(cb); }, samplesForCurrentRow()); record(m_name, t, pe.stats); From 2a47606db6dcdcca3ff6f137a4dce8fda2f78edd Mon Sep 17 00:00:00 2001 From: Arkadiusz Lachowicz Date: Sun, 17 May 2026 12:49:36 +0200 Subject: [PATCH 3/4] Fix GPU benchmark submit chunking --- .../examples/Benchmark/GPUBenchmarkHelper.h | 151 +++++++++++++----- 1 file changed, 115 insertions(+), 36 deletions(-) diff --git a/common/include/nbl/examples/Benchmark/GPUBenchmarkHelper.h b/common/include/nbl/examples/Benchmark/GPUBenchmarkHelper.h index b5bd69a13..3cd5d7a91 100644 --- a/common/include/nbl/examples/Benchmark/GPUBenchmarkHelper.h +++ b/common/include/nbl/examples/Benchmark/GPUBenchmarkHelper.h @@ -12,7 +12,9 @@ #include "nbl/asset/utils/IShaderCompiler.h" #include +#include #include +#include #include #include #include @@ -71,8 +73,8 @@ class GPUBenchmarkHelper bool isPrecompiled() const { return !precompiledKey.empty(); } }; - // Layout: [bindOnce] [warmup x dispatchOne][ts0][bench x dispatchOne][ts1][cooldown x dispatchOne] - // Cooldown == warmup so the measured window isn't on a winding-down tail. + // Logical layout: [warmup x dispatchOne][ts0][bench x dispatchOne][ts1][cooldown x dispatchOne] + // Warmup/cooldown can be split into shorter submissions and the measured window stays intact. // Putting binds inside dispatchOne adds per-iteration cmdbuf overhead that // shows up in ps/sample on tight shaders. using DispatchFn = std::function; @@ -513,8 +515,22 @@ class GPUBenchmarkHelper const uint64_t targetBudgetNs = targetBudgetMs * 1'000'000ull; constexpr uint32_t kPilotN = 64; constexpr uint32_t kMaxN = 1u << 24; // safety cap for ultra-fast shaders - TimingResult r = runTimed(warmupDispatches, kPilotN, bindOnce, dispatchOne); + uint32_t dispatchesPerSubmit = 1u; + TimingResult r = runTimed(warmupDispatches, kPilotN, bindOnce, dispatchOne, dispatchesPerSubmit); + dispatchesPerSubmit = estimateDispatchesPerSubmit(r, kPilotN); uint32_t lastN = kPilotN; + while (r.elapsed_ns > targetBudgetNs && lastN > 1u) + { + const double scale = double(targetBudgetNs) / r.elapsed_ns; + uint32_t nextN = uint32_t(std::max(1.0, std::floor(double(lastN) * scale))); + if (nextN >= lastN) + nextN = lastN - 1u; + + r = runTimed(warmupDispatches, nextN, bindOnce, dispatchOne, dispatchesPerSubmit); + dispatchesPerSubmit = estimateDispatchesPerSubmit(r, nextN); + lastN = nextN; + } + while (r.elapsed_ns < targetBudgetNs && lastN < kMaxN) { uint32_t nextN; @@ -529,8 +545,9 @@ class GPUBenchmarkHelper } if (nextN <= lastN) break; // converged - r = runTimed(warmupDispatches, nextN, bindOnce, dispatchOne); - lastN = nextN; + r = runTimed(warmupDispatches, nextN, bindOnce, dispatchOne, dispatchesPerSubmit); + dispatchesPerSubmit = estimateDispatchesPerSubmit(r, nextN); + lastN = nextN; } if (samples <= 1) @@ -545,7 +562,7 @@ class GPUBenchmarkHelper ns.push_back(r.elapsed_ns); for (uint32_t i = 1; i < samples; ++i) { - const TimingResult ri = runTimed(warmupDispatches, lastN, bindOnce, dispatchOne); + const TimingResult ri = runTimed(warmupDispatches, lastN, bindOnce, dispatchOne, dispatchesPerSubmit); ns.push_back(ri.elapsed_ns); } std::sort(ns.begin(), ns.end()); @@ -580,48 +597,53 @@ class GPUBenchmarkHelper return m; } - TimingResult runTimed(uint32_t warmupDispatches, uint32_t benchDispatches, const DispatchFn& bindOnce, const DispatchFn& dispatchOne) + TimingResult runTimed(uint32_t warmupDispatches, uint32_t benchDispatches, const DispatchFn& bindOnce, const DispatchFn& dispatchOne, uint32_t maxDispatchesPerSubmit) { - m_device->waitIdle(); + if (m_device->waitIdle() != nbl::video::IQueue::RESULT::SUCCESS) + return {}; + const uint32_t cooldownDispatches = warmupDispatches; - m_cmdbuf->reset(nbl::video::IGPUCommandBuffer::RESET_FLAGS::NONE); - m_cmdbuf->begin(nbl::video::IGPUCommandBuffer::USAGE::ONE_TIME_SUBMIT_BIT); - m_cmdbuf->resetQueryPool(m_queryPool.get(), 0, 2); + if (!runUntimedDispatches(warmupDispatches, bindOnce, dispatchOne, maxDispatchesPerSubmit)) + return {}; - if (bindOnce) - bindOnce(m_cmdbuf.get()); + double elapsedNs = 0.0; + uint32_t remaining = benchDispatches; + while (remaining > 0u) + { + const uint32_t batch = std::min(remaining, std::max(1u, maxDispatchesPerSubmit)); - for (uint32_t i = 0u; i < warmupDispatches; ++i) - dispatchOne(m_cmdbuf.get()); + m_cmdbuf->reset(nbl::video::IGPUCommandBuffer::RESET_FLAGS::NONE); + m_cmdbuf->begin(nbl::video::IGPUCommandBuffer::USAGE::ONE_TIME_SUBMIT_BIT); + m_cmdbuf->resetQueryPool(m_queryPool.get(), 0, 2); - m_cmdbuf->writeTimestamp(nbl::asset::PIPELINE_STAGE_FLAGS::COMPUTE_SHADER_BIT, m_queryPool.get(), 0); - for (uint32_t i = 0u; i < benchDispatches; ++i) - dispatchOne(m_cmdbuf.get()); - m_cmdbuf->writeTimestamp(nbl::asset::PIPELINE_STAGE_FLAGS::COMPUTE_SHADER_BIT, m_queryPool.get(), 1); + if (bindOnce) + bindOnce(m_cmdbuf.get()); - for (uint32_t i = 0u; i < cooldownDispatches; ++i) - dispatchOne(m_cmdbuf.get()); - m_cmdbuf->end(); + m_cmdbuf->writeTimestamp(nbl::asset::PIPELINE_STAGE_FLAGS::COMPUTE_SHADER_BIT, m_queryPool.get(), 0); + for (uint32_t i = 0u; i < batch; ++i) + dispatchOne(m_cmdbuf.get()); + m_cmdbuf->writeTimestamp(nbl::asset::PIPELINE_STAGE_FLAGS::COMPUTE_SHADER_BIT, m_queryPool.get(), 1); + m_cmdbuf->end(); - auto semaphore = m_device->createSemaphore(0u); - const nbl::video::IQueue::SSubmitInfo::SCommandBufferInfo benchCmds[] = {{.cmdbuf = m_cmdbuf.get()}}; - const nbl::video::IQueue::SSubmitInfo::SSemaphoreInfo signalSem[] = { - {.semaphore = semaphore.get(), .value = 1u, .stageMask = nbl::asset::PIPELINE_STAGE_FLAGS::COMPUTE_SHADER_BIT}}; - nbl::video::IQueue::SSubmitInfo submit = {}; - submit.commandBuffers = benchCmds; - submit.signalSemaphores = signalSem; - m_queue->submit({&submit, 1u}); + if (!submitAndWait()) + return {}; - m_device->waitIdle(); + uint64_t timestamps[2] = {}; + const auto flags = nbl::core::bitflag(nbl::video::IQueryPool::RESULTS_FLAGS::_64_BIT) | nbl::core::bitflag(nbl::video::IQueryPool::RESULTS_FLAGS::WAIT_BIT); + if (!m_device->getQueryPoolResults(m_queryPool.get(), 0, 2, timestamps, sizeof(uint64_t), flags)) + return {}; + + const double timestampPeriod = double(m_physicalDevice->getLimits().timestampPeriodInNanoSeconds); + elapsedNs += double(timestamps[1] - timestamps[0]) * timestampPeriod; + remaining -= batch; + } - uint64_t timestamps[2] = {}; - const auto flags = nbl::core::bitflag(nbl::video::IQueryPool::RESULTS_FLAGS::_64_BIT) | nbl::core::bitflag(nbl::video::IQueryPool::RESULTS_FLAGS::WAIT_BIT); - m_device->getQueryPoolResults(m_queryPool.get(), 0, 2, timestamps, sizeof(uint64_t), flags); + if (!runUntimedDispatches(cooldownDispatches, bindOnce, dispatchOne, maxDispatchesPerSubmit)) + return {}; TimingResult r {}; - const double timestampPeriod = double(m_physicalDevice->getLimits().timestampPeriodInNanoSeconds); - r.elapsed_ns = double(timestamps[1] - timestamps[0]) * timestampPeriod; + r.elapsed_ns = elapsedNs; r.totalSamples = uint64_t(benchDispatches) * m_samplesPerDispatch; r.ps_per_sample = r.totalSamples ? r.elapsed_ns * 1e3 / double(r.totalSamples) : 0.0; r.gsamples_per_s = r.elapsed_ns > 0.0 ? double(r.totalSamples) / r.elapsed_ns : 0.0; @@ -633,6 +655,63 @@ class GPUBenchmarkHelper std::vector m_pipelines; private: + // Soft target for one queue submit, estimated from timings on the current GPU. + // Benchmark budgets still control measured work. This only chunks submits. + static constexpr double SubmitChunkTargetNs = 250'000'000.0; + + static uint32_t estimateDispatchesPerSubmit(const TimingResult& r, uint32_t dispatches) + { + if (dispatches == 0u || r.elapsed_ns <= 0.0) + return 1u; + + const double nsPerDispatch = r.elapsed_ns / double(dispatches); + if (nsPerDispatch <= 0.0) + return 1u; + + const double maxDispatches = std::floor(SubmitChunkTargetNs / nsPerDispatch); + return uint32_t(std::clamp(maxDispatches, 1.0, double(std::numeric_limits::max()))); + } + + bool submitAndWait() + { + auto semaphore = m_device->createSemaphore(0u); + if (!semaphore) + return false; + + const nbl::video::IQueue::SSubmitInfo::SCommandBufferInfo cmds[] = {{.cmdbuf = m_cmdbuf.get()}}; + const nbl::video::IQueue::SSubmitInfo::SSemaphoreInfo done[] = { + {.semaphore = semaphore.get(), .value = 1u, .stageMask = nbl::asset::PIPELINE_STAGE_FLAGS::ALL_COMMANDS_BITS}}; + nbl::video::IQueue::SSubmitInfo submit = {}; + submit.commandBuffers = cmds; + submit.signalSemaphores = done; + if (m_queue->submit({&submit, 1u}) != nbl::video::IQueue::RESULT::SUCCESS) + return false; + + const nbl::video::ISemaphore::SWaitInfo wait[] = {{.semaphore = semaphore.get(), .value = 1u}}; + return m_device->blockForSemaphores(wait) == nbl::video::ISemaphore::WAIT_RESULT::SUCCESS; + } + + bool runUntimedDispatches(uint32_t dispatches, const DispatchFn& bindOnce, const DispatchFn& dispatchOne, uint32_t maxDispatchesPerSubmit) + { + while (dispatches > 0u) + { + const uint32_t batch = std::min(dispatches, std::max(1u, maxDispatchesPerSubmit)); + + m_cmdbuf->reset(nbl::video::IGPUCommandBuffer::RESET_FLAGS::NONE); + m_cmdbuf->begin(nbl::video::IGPUCommandBuffer::USAGE::ONE_TIME_SUBMIT_BIT); + if (bindOnce) + bindOnce(m_cmdbuf.get()); + for (uint32_t i = 0u; i < batch; ++i) + dispatchOne(m_cmdbuf.get()); + m_cmdbuf->end(); + + if (!submitAndWait()) + return false; + dispatches -= batch; + } + return true; + } + static void matchStat(const nbl::video::IGPUPipelineBase::SExecutableStatistic& stat, PipelineStats& out, uint64_t& vgpr, uint64_t& sgpr) { const uint64_t v = stat.asUint(); From f8c9b02fcd45727a864290cf993a07144b87d1e3 Mon Sep 17 00:00:00 2001 From: Arkadiusz Lachowicz Date: Mon, 18 May 2026 10:45:22 +0200 Subject: [PATCH 4/4] Fix benchmark sync and failed reruns --- .../benchmarks/CDiscreteSamplerBenchmark.h | 10 +- .../benchmarks/CSamplerBenchmark.h | 8 +- 37_HLSLSamplingTests/main.cpp | 133 +++++-- .../tests/SamplerTestHelpers.h | 4 +- .../tests/property/CSamplerPropertyTester.h | 20 +- 64_EmulatedFloatTest/main.cpp | 12 +- .../nbl/examples/Benchmark/BenchmarkJson.h | 49 ++- .../examples/Benchmark/GPUBenchmarkHelper.h | 24 +- .../nbl/examples/Tester/FailureManifest.h | 331 ++++++++++++++++++ common/include/nbl/examples/Tester/ITester.h | 32 +- 10 files changed, 551 insertions(+), 72 deletions(-) create mode 100644 common/include/nbl/examples/Tester/FailureManifest.h diff --git a/37_HLSLSamplingTests/benchmarks/CDiscreteSamplerBenchmark.h b/37_HLSLSamplingTests/benchmarks/CDiscreteSamplerBenchmark.h index d6289f54b..f12ba9421 100644 --- a/37_HLSLSamplingTests/benchmarks/CDiscreteSamplerBenchmark.h +++ b/37_HLSLSamplingTests/benchmarks/CDiscreteSamplerBenchmark.h @@ -198,7 +198,9 @@ class CDiscreteSamplerBenchmark : public GPUBenchmark // Pipeline + push constants are bound *once* in bindOnce, the inner loop is just // dispatch(...). Putting binds inside dispatchOne would inflate ps/sample on the // tighter samplers. - const PipelineEntry& pe = m_pipelines[m_pipelineIdx[size_t(kind)]]; + const PipelineEntry* pe = getPipelineEntry(m_pipelineIdx[size_t(kind)], joinName(name)); + if (!pe) + return; const TimingResult timingResult = runTimedBudgeted(warmupIterations, getTargetBudgetMs(), [&](IGPUCommandBuffer* cb) @@ -210,7 +212,7 @@ class CDiscreteSamplerBenchmark : public GPUBenchmark pc.pdfAddress = m_aliasPdfBuf->getDeviceAddress(); pc.outputAddress = m_outputBuf->getDeviceAddress(); pc.tableSize = m_aliasTableN; - defaultBindAndPush(cb, pe, pc); + defaultBindAndPush(cb, *pe, pc); } else { @@ -219,13 +221,13 @@ class CDiscreteSamplerBenchmark : public GPUBenchmark pc.cumProbAddress = buf->getDeviceAddress(); pc.outputAddress = m_outputBuf->getDeviceAddress(); pc.tableSize = N; - defaultBindAndPush(cb, pe, pc); + defaultBindAndPush(cb, *pe, pc); } }, [this](IGPUCommandBuffer* cb) { defaultDispatch(cb); }, samplesForCurrentRow()); - record(std::move(name), timingResult, pe.stats); + record(std::move(name), timingResult, pe->stats); } core::smart_refctd_ptr m_assetMgr; diff --git a/37_HLSLSamplingTests/benchmarks/CSamplerBenchmark.h b/37_HLSLSamplingTests/benchmarks/CSamplerBenchmark.h index dd7149829..7410b7242 100644 --- a/37_HLSLSamplingTests/benchmarks/CSamplerBenchmark.h +++ b/37_HLSLSamplingTests/benchmarks/CSamplerBenchmark.h @@ -39,16 +39,18 @@ class CSamplerBenchmark : public GPUBenchmark void doRun() override { - const PipelineEntry& pe = m_pipelines[m_pipelineIdx]; + const PipelineEntry* pe = getPipelineEntry(m_pipelineIdx, joinName(m_name)); + if (!pe) + return; SamplerBenchPushConstants pc = {}; pc.outputAddress = m_outputAddress; const TimingResult t = runTimedBudgeted(getWarmupDispatches(), getTargetBudgetMs(), - [&](video::IGPUCommandBuffer* cb) { defaultBindAndPush(cb, pe, pc); }, + [&](video::IGPUCommandBuffer* cb) { defaultBindAndPush(cb, *pe, pc); }, [this](video::IGPUCommandBuffer* cb) { defaultDispatch(cb); }, samplesForCurrentRow()); - record(m_name, t, pe.stats); + record(m_name, t, pe->stats); } private: diff --git a/37_HLSLSamplingTests/main.cpp b/37_HLSLSamplingTests/main.cpp index 634e84123..1c3f6000d 100644 --- a/37_HLSLSamplingTests/main.cpp +++ b/37_HLSLSamplingTests/main.cpp @@ -52,6 +52,7 @@ using namespace nbl::examples; #include "benchmarks/CSamplerBenchmark.h" #include "benchmarks/CDiscreteSamplerBenchmark.h" +#include "nbl/examples/Tester/FailureManifest.h" #include "tests/property/CSamplerPropertyTester.h" @@ -189,6 +190,12 @@ class HLSLSamplingTests final : public application_templates::MonoDeviceApplicat m_logger->log("All sampling concept tests passed.", ILogger::ELL_INFO); + const auto runControl = nbl::examples::testing::parseRunControl(this->argv, m_logger.get()); + if (!runControl.valid) + return false; + + nbl::examples::testing::FailureManifest failureManifest("37_HLSLSamplingTests"); + // ====================================================================== // GPU throughput benchmarks // ====================================================================== @@ -197,6 +204,12 @@ class HLSLSamplingTests final : public application_templates::MonoDeviceApplicat if constexpr (DoBenchmark) { + if (runControl.skipBenchmarks) + { + m_logger->log("Skipping benchmark phase due to CLI.", ILogger::ELL_INFO); + } + else + { constexpr uint32_t benchWorkgroupSize = WORKGROUP_SIZE; constexpr uint32_t totalThreadsPerDispatch = benchWorkgroupsCount * benchWorkgroupSize; constexpr uint32_t iterationsPerThread = BENCH_ITERS; @@ -346,6 +359,7 @@ class HLSLSamplingTests final : public application_templates::MonoDeviceApplicat Aggregator::makeSpan(benchmarks, samplerCtx), Aggregator::makeSpan(discreteBench, discreteCtx)); } + } } // ================================================================ @@ -353,9 +367,16 @@ class HLSLSamplingTests final : public application_templates::MonoDeviceApplicat // ================================================================ bool pass = true; constexpr uint32_t testWorkgroupsCount = 4096; + bool samplerPass = true; // generic lambda to run a GPU sampler test - auto runSamplerTest = [&](const char* testName, const char* logFile) + auto runSamplerTest = [&](const char* id, const char* testName, const char* logFile) { + if (!runControl.filter.shouldRun(id)) + { + m_logger->log("Skipping %s tests due to filter.", ILogger::ELL_INFO, testName); + return; + } + m_logger->log("Running %s tests...", ILogger::ELL_INFO, testName); typename Tester::PipelineSetupData data; data.device = m_device; @@ -367,43 +388,58 @@ class HLSLSamplingTests final : public application_templates::MonoDeviceApplicat data.shaderKey = std::move(nbl::this_example::builtin::build::get_spirv_key(m_device.get())); Tester tester(testWorkgroupsCount); tester.setupPipeline(data); - pass &= tester.performTestsAndVerifyResults(logFile); + if (const auto seed = runControl.filter.seedFor(id); seed.has_value()) + tester.setSeed(*seed); + tester.setFailureRecordContext(&failureManifest, "sampler", id, testName); + samplerPass &= tester.performTestsAndVerifyResults(logFile); }; // --- Sampler tests --- if constexpr (true) { - runSamplerTest.operator()("Linear sampler", "LinearTestLog.txt"); - runSamplerTest.operator()("Bilinear sampler", "BilinearTestLog.txt"); - runSamplerTest.operator()("UniformHemisphere sampler", "UniformHemisphereTestLog.txt"); - runSamplerTest.operator()("UniformSphere sampler", "UniformSphereTestLog.txt"); - runSamplerTest.operator()("ProjectedHemisphere sampler", "ProjectedHemisphereTestLog.txt"); - runSamplerTest.operator()("ProjectedSphere sampler", "ProjectedSphereTestLog.txt"); - runSamplerTest.operator()("ConcentricMapping sampler", "ConcentricMappingTestLog.txt"); - runSamplerTest.operator()("PolarMapping sampler", "PolarMappingTestLog.txt"); - runSamplerTest.operator()("BoxMullerTransform sampler", "BoxMullerTransformTestLog.txt"); - runSamplerTest.operator()("SphericalTriangle", "SphericalTriangleTestLog.txt"); - runSamplerTest.operator()("ProjectedSphericalTriangle sampler", "ProjectedSphericalTriangleTestLog.txt"); - runSamplerTest.operator()("SphericalRectangle sampler", "SphericalRectangleTestLog.txt"); - runSamplerTest.operator()("ProjectedSphericalRectangle sampler", "ProjectedSphericalRectangleTestLog.txt"); + runSamplerTest.operator()("sampler/Linear", "Linear sampler", "LinearTestLog.txt"); + runSamplerTest.operator()("sampler/Bilinear", "Bilinear sampler", "BilinearTestLog.txt"); + runSamplerTest.operator()("sampler/UniformHemisphere", "UniformHemisphere sampler", "UniformHemisphereTestLog.txt"); + runSamplerTest.operator()("sampler/UniformSphere", "UniformSphere sampler", "UniformSphereTestLog.txt"); + runSamplerTest.operator()("sampler/ProjectedHemisphere", "ProjectedHemisphere sampler", "ProjectedHemisphereTestLog.txt"); + runSamplerTest.operator()("sampler/ProjectedSphere", "ProjectedSphere sampler", "ProjectedSphereTestLog.txt"); + runSamplerTest.operator()("sampler/ConcentricMapping", "ConcentricMapping sampler", "ConcentricMappingTestLog.txt"); + runSamplerTest.operator()("sampler/PolarMapping", "PolarMapping sampler", "PolarMappingTestLog.txt"); + runSamplerTest.operator()("sampler/BoxMullerTransform", "BoxMullerTransform sampler", "BoxMullerTransformTestLog.txt"); + runSamplerTest.operator()("sampler/SphericalTriangle", "SphericalTriangle", "SphericalTriangleTestLog.txt"); + runSamplerTest.operator()("sampler/ProjectedSphericalTriangle", "ProjectedSphericalTriangle sampler", "ProjectedSphericalTriangleTestLog.txt"); + runSamplerTest.operator()("sampler/SphericalRectangle", "SphericalRectangle sampler", "SphericalRectangleTestLog.txt"); + runSamplerTest.operator()("sampler/ProjectedSphericalRectangle", "ProjectedSphericalRectangle sampler", "ProjectedSphericalRectangleTestLog.txt"); } if constexpr (true) { // --- Discrete table construction (CPU) --- { - m_logger->log("Running discrete table builder tests (CPU)...", ILogger::ELL_INFO); - CDiscreteTableTester tableTester(m_logger.get()); - pass &= tableTester.run(); + constexpr const char* id = "sampler/DiscreteTableBuilder"; + if (!runControl.filter.shouldRun(id)) + { + m_logger->log("Skipping discrete table builder tests due to filter.", ILogger::ELL_INFO); + } + else + { + m_logger->log("Running discrete table builder tests (CPU)...", ILogger::ELL_INFO); + CDiscreteTableTester tableTester(m_logger.get()); + const bool ok = tableTester.run(); + samplerPass &= ok; + if (!ok) + failureManifest.addGroupFailure("sampler", id, "Discrete table builder"); + } } // --- GPU table sampler tests --- - runSamplerTest.operator()("PackedAliasA GPU sampler", "PackedAliasATestLog.txt"); - runSamplerTest.operator()("PackedAliasB GPU sampler", "PackedAliasBTestLog.txt"); - runSamplerTest.operator()("CumulativeProbability GPU sampler", "CumulativeProbabilityTestLog.txt"); + runSamplerTest.operator()("sampler/PackedAliasA", "PackedAliasA GPU sampler", "PackedAliasATestLog.txt"); + runSamplerTest.operator()("sampler/PackedAliasB", "PackedAliasB GPU sampler", "PackedAliasBTestLog.txt"); + runSamplerTest.operator()("sampler/CumulativeProbability", "CumulativeProbability GPU sampler", "CumulativeProbabilityTestLog.txt"); } logJacobianSkipCounts(m_logger.get()); - if (pass) + pass &= samplerPass; + if (samplerPass) m_logger->log("All sampling tests PASSED.", ILogger::ELL_INFO); else m_logger->log("Some sampling tests FAILED. Check log files for details.", ILogger::ELL_ERROR); @@ -413,12 +449,28 @@ class HLSLSamplingTests final : public application_templates::MonoDeviceApplicat // ================================================================ if constexpr (true) { + bool propertyPass = true; m_logger->log("Running sampler property tests (CPU)...", ILogger::ELL_INFO); m_logger->log("WARNING: CPU math may use higher intermediate precision than GPU shaders. Tolerances that pass here may be too tight for GPU.", ILogger::ELL_WARNING); auto check = [&]() { - pass &= CSamplerPropertyTester(m_logger.get()).run(); + const std::string id = std::string("property/") + Config::name(); + if (!runControl.filter.shouldRun(id)) + { + m_logger->log("Skipping %s property tests due to filter.", ILogger::ELL_INFO, Config::name()); + return; + } + + CSamplerPropertyTester tester(m_logger.get(), runControl.filter.seedFor(id)); + const bool ok = tester.run(); + propertyPass &= ok; + if (!ok) + { + failureManifest.addGroupFailure("property", id, Config::name()); + if (const auto seed = tester.failureSeed(); seed.has_value()) + failureManifest.addCase("property", id, Config::name(), "property", "CPU", 0, *seed, 0.0, 0.0); + } }; check.operator()(); @@ -444,7 +496,8 @@ class HLSLSamplingTests final : public application_templates::MonoDeviceApplicat // Grazing angle tests check.operator()(); - if (pass) + pass &= propertyPass; + if (propertyPass) m_logger->log("All sampler property tests PASSED.", ILogger::ELL_INFO); else m_logger->log("Some sampler property tests FAILED.", ILogger::ELL_ERROR); @@ -455,27 +508,41 @@ class HLSLSamplingTests final : public application_templates::MonoDeviceApplicat // ================================================================ if constexpr (true) { + bool geometryPass = true; m_logger->log("Running geometry tests (CPU)...", ILogger::ELL_INFO); m_logger->log("WARNING: CPU math may use higher intermediate precision than GPU shaders. Tolerances that pass here may be too tight for GPU.", ILogger::ELL_WARNING); - auto check = [&]() + auto check = [&](const char* id, const char* name) { - pass &= Tester(m_logger.get()).run(); + if (!runControl.filter.shouldRun(id)) + { + m_logger->log("Skipping %s geometry tests due to filter.", ILogger::ELL_INFO, name); + return; + } + + const bool ok = Tester(m_logger.get()).run(); + geometryPass &= ok; + if (!ok) + failureManifest.addGroupFailure("geometry", id, name); }; - check.template operator()(); - check.template operator()(); - check.template operator()(); - check.template operator()(); - check.template operator()(); - check.template operator()(); + check.template operator()("geometry/SolidAngleAccuracy", "SolidAngleAccuracy"); + check.template operator()("geometry/SphericalTriangleGenerate", "SphericalTriangleGenerate"); + check.template operator()("geometry/SphericalRectangleGenerate", "SphericalRectangleGenerate"); + check.template operator()("geometry/ProjectedSphericalRectangleGenerate", "ProjectedSphericalRectangleGenerate"); + check.template operator()("geometry/ProjectedSphericalRectangle", "ProjectedSphericalRectangle"); + check.template operator()("geometry/ProjectedSphericalTriangle", "ProjectedSphericalTriangle"); - if (pass) + pass &= geometryPass; + if (geometryPass) m_logger->log("All geometry tests PASSED.", ILogger::ELL_INFO); else m_logger->log("Some geometry tests FAILED.", ILogger::ELL_ERROR); } + if (!runControl.failedOutPath.empty()) + pass &= nbl::examples::testing::writeFailureManifestFile(failureManifest, runControl.failedOutPath, m_logger.get()); + return pass; } diff --git a/37_HLSLSamplingTests/tests/SamplerTestHelpers.h b/37_HLSLSamplingTests/tests/SamplerTestHelpers.h index 44dd5f961..1246ebc08 100644 --- a/37_HLSLSamplingTests/tests/SamplerTestHelpers.h +++ b/37_HLSLSamplingTests/tests/SamplerTestHelpers.h @@ -7,6 +7,8 @@ #include #include +#include + // ============================================================================ // Declarative field verification helpers // @@ -475,7 +477,7 @@ struct SeededTestContext std::mt19937 rng; uint32_t failCount = 0; - SeededTestContext() : seed(std::random_device {}()), rng(seed) {} + SeededTestContext(std::optional seedOverride = {}) : seed(seedOverride.value_or(std::random_device {}())), rng(seed) {} // Log "reproduce with seed" if failCount > 0, return failCount == 0 bool finalize(nbl::system::ILogger* logger, const char* tag) const diff --git a/37_HLSLSamplingTests/tests/property/CSamplerPropertyTester.h b/37_HLSLSamplingTests/tests/property/CSamplerPropertyTester.h index ecb0f606d..b20ba88f9 100644 --- a/37_HLSLSamplingTests/tests/property/CSamplerPropertyTester.h +++ b/37_HLSLSamplingTests/tests/property/CSamplerPropertyTester.h @@ -78,7 +78,9 @@ class CSamplerPropertyTester } public: - CSamplerPropertyTester(system::ILogger* logger) : m_logger(logger) {} + CSamplerPropertyTester(system::ILogger* logger, std::optional seedOverride = {}) : m_logger(logger), m_seedOverride(seedOverride) {} + + std::optional failureSeed() const { return m_failureSeed; } bool run() { @@ -96,7 +98,7 @@ class CSamplerPropertyTester // If the PDF normalization is wrong by factor k, this will be off by 1/k. bool testMonteCarloPdfNormalization() { - SeededTestContext ctx; + SeededTestContext ctx(m_seedOverride); uint32_t evaluatedConfigs = 0; for (uint32_t c = 0; c < Config::numConfigurations; c++) @@ -159,7 +161,10 @@ class CSamplerPropertyTester m_logger->log(" [%s] MC normalization FAILED (%u/%u evaluated configs failed, %u/%u configs evaluated, %u samples/config, relTol=%e)", system::ILogger::ELL_ERROR, Config::name(), ctx.failCount, evaluatedConfigs, evaluatedConfigs, Config::numConfigurations, Config::samplesPerConfig, Config::mcNormalizationRelTol); - return ctx.finalize(m_logger, Config::name()); + const bool passed = ctx.finalize(m_logger, Config::name()); + if (!passed) + m_failureSeed = ctx.seed; + return passed; } // Test 4: Grid integration of backwardPdf over [0,1]^d codomain @@ -167,7 +172,7 @@ class CSamplerPropertyTester // integral of backwardPdf over codomain should equal 1.0. bool testGridPdfNormalization() { - SeededTestContext ctx; + SeededTestContext ctx(m_seedOverride); for (uint32_t c = 0; c < Config::numConfigurations; c++) { @@ -191,10 +196,15 @@ class CSamplerPropertyTester m_logger->log(" [%s] grid PDF normalization FAILED (%u/%u configs exceeded absTol=%e)", system::ILogger::ELL_ERROR, Config::name(), ctx.failCount, Config::numConfigurations, Config::gridNormalizationAbsTol); - return ctx.finalize(m_logger, Config::name()); + const bool passed = ctx.finalize(m_logger, Config::name()); + if (!passed) + m_failureSeed = ctx.seed; + return passed; } system::ILogger* m_logger; + std::optional m_seedOverride; + std::optional m_failureSeed; }; diff --git a/64_EmulatedFloatTest/main.cpp b/64_EmulatedFloatTest/main.cpp index 11a6d2f55..549596bac 100644 --- a/64_EmulatedFloatTest/main.cpp +++ b/64_EmulatedFloatTest/main.cpp @@ -91,20 +91,22 @@ class CEF64Benchmark : public GPUBenchmark void doRun() override { - const PipelineEntry& pe = m_pipelines[m_pipelineIdx]; + const PipelineEntry* pe = getPipelineEntry(m_pipelineIdx, joinName(m_name)); + if (!pe) + return; BenchmarkPushConstants pc = {}; pc.benchmarkMode = m_mode; const TimingResult t = runTimedBudgeted(getWarmupDispatches(), getTargetBudgetMs(), [&](IGPUCommandBuffer* cb) { - cb->bindDescriptorSets(EPBP_COMPUTE, pe.layout.get(), 0, 1, &m_ds.get()); - defaultBindAndPush(cb, pe, pc); + cb->bindDescriptorSets(EPBP_COMPUTE, pe->layout.get(), 0, 1, &m_ds.get()); + defaultBindAndPush(cb, *pe, pc); }, [this](IGPUCommandBuffer* cb) { defaultDispatch(cb); }, samplesForCurrentRow()); - record(m_name, t, pe.stats); + record(m_name, t, pe->stats); } private: @@ -1055,4 +1057,4 @@ class CompatibilityTest final : public MonoDeviceApplication, public BuiltinReso std::ofstream m_logFile; }; -NBL_MAIN_FUNC(CompatibilityTest) \ No newline at end of file +NBL_MAIN_FUNC(CompatibilityTest) diff --git a/common/include/nbl/examples/Benchmark/BenchmarkJson.h b/common/include/nbl/examples/Benchmark/BenchmarkJson.h index dc7846848..e6d3fff24 100644 --- a/common/include/nbl/examples/Benchmark/BenchmarkJson.h +++ b/common/include/nbl/examples/Benchmark/BenchmarkJson.h @@ -13,6 +13,7 @@ #include #include #include +#include #include #include @@ -85,32 +86,52 @@ inline std::optional loadBaselineFile(std::string label, const std::st continue; BaselineRow row; - row.psPerSample = ps->get(); - row.registerCount = r.at("regs").get(); - row.codeSizeBytes = r.at("code_bytes").get(); - row.sharedMemBytes = r.at("shared_mem_bytes").get(); - row.privateMemBytes = r.at("local_mem_bytes").get(); - row.stackBytes = r.at("stack_bytes").get(); - row.subgroupSize = r.at("subgroup_size").get(); + try + { + row.psPerSample = ps->get(); + } + catch (const std::exception&) + { + continue; + } + + auto readU64 = [&](const char* key, uint64_t& out) + { + const auto it = r.find(key); + if (it != r.end() && it->is_number_unsigned()) + out = it->get(); + }; + readU64("regs", row.registerCount); + readU64("code_bytes", row.codeSizeBytes); + readU64("shared_mem_bytes", row.sharedMemBytes); + readU64("local_mem_bytes", row.privateMemBytes); + readU64("stack_bytes", row.stackBytes); + readU64("subgroup_size", row.subgroupSize); auto readUvec3 = [&](const char* key, nbl::hlsl::uint32_t3& out) { - const auto& a = r.at(key); - out.x = a[0].get(); - out.y = a[1].get(); - out.z = a[2].get(); + const auto it = r.find(key); + if (it == r.end() || !it->is_array() || it->size() != 3) + return; + const auto& a = *it; + if (!a[0].is_number_unsigned() || !a[1].is_number_unsigned() || !a[2].is_number_unsigned()) + return; + out.x = a[0].get(); + out.y = a[1].get(); + out.z = a[2].get(); }; readUvec3("workgroup_size", row.workload.shape.workgroupSize); readUvec3("dispatch_groups", row.workload.shape.dispatchGroupCount); - row.workload.shape.samplesPerDispatch = r.at("samples_per_dispatch").get(); - row.workload.benchDispatches = r.at("bench_dispatches").get(); + readU64("samples_per_dispatch", row.workload.shape.samplesPerDispatch); + if (const auto it = r.find("bench_dispatches"); it != r.end() && it->is_number_unsigned()) + row.workload.benchDispatches = it->get(); rowsByName[makeKey(nameVec)] = row; } if (rowsByName.empty()) return std::nullopt; - return Baseline {std::move(label), path, j.at("device"), std::move(rowsByName)}; + return Baseline {std::move(label), path, j.contains("device") ? j["device"] : nullptr, std::move(rowsByName)}; } // Writes a JSON report. Preserves rows in the prior file whose names weren't diff --git a/common/include/nbl/examples/Benchmark/GPUBenchmarkHelper.h b/common/include/nbl/examples/Benchmark/GPUBenchmarkHelper.h index 3cd5d7a91..553e5a21b 100644 --- a/common/include/nbl/examples/Benchmark/GPUBenchmarkHelper.h +++ b/common/include/nbl/examples/Benchmark/GPUBenchmarkHelper.h @@ -344,9 +344,7 @@ class GPUBenchmarkHelper nbl::core::smart_refctd_ptr dsLayout = nullptr) { using namespace nbl; - const uint32_t idx = uint32_t(m_pipelines.size()); - m_pipelines.push_back({.tag = tag}); - PipelineEntry& slot = m_pipelines.back(); + PipelineEntry slot = {.tag = tag}; const asset::SPushConstantRange pcRange = { .stageFlags = asset::IShader::E_SHADER_STAGE::ESS_COMPUTE, @@ -359,7 +357,7 @@ class GPUBenchmarkHelper if (!layout) { benchLogFmt(m_logger.get(), system::ILogger::ELL_ERROR, "createPipeline({}): pipeline layout creation failed", tag); - return idx; + return InvalidPipelineIndex; } auto source = loadShader(variant, std::move(assetMgr)); @@ -367,7 +365,7 @@ class GPUBenchmarkHelper if (!shader) { benchLogFmt(m_logger.get(), system::ILogger::ELL_ERROR, "createPipeline({}): shader load/compile failed", tag); - return idx; + return InvalidPipelineIndex; } video::IGPUComputePipeline::SCreationParams pp = {}; @@ -381,7 +379,7 @@ class GPUBenchmarkHelper if (!m_device->createComputePipelines(nullptr, {&pp, 1}, &pipeline) || !pipeline) { benchLogFmt(m_logger.get(), system::ILogger::ELL_ERROR, "createPipeline({}): createComputePipelines failed", tag); - return idx; + return InvalidPipelineIndex; } if (m_device->getEnabledFeatures().pipelineExecutableInfo) @@ -408,6 +406,8 @@ class GPUBenchmarkHelper slot.layout = std::move(layout); slot.pipeline = std::move(pipeline); + const uint32_t idx = uint32_t(m_pipelines.size()); + m_pipelines.push_back(std::move(slot)); return idx; } @@ -652,6 +652,18 @@ class GPUBenchmarkHelper } protected: + static constexpr uint32_t InvalidPipelineIndex = std::numeric_limits::max(); + + const PipelineEntry* getPipelineEntry(uint32_t idx, std::string_view context) const + { + if (idx == InvalidPipelineIndex || idx >= m_pipelines.size() || !m_pipelines[idx].pipeline) + { + benchLogFmt(m_logger.get(), nbl::system::ILogger::ELL_ERROR, "{}: pipeline is not available", context); + return nullptr; + } + return &m_pipelines[idx]; + } + std::vector m_pipelines; private: diff --git a/common/include/nbl/examples/Tester/FailureManifest.h b/common/include/nbl/examples/Tester/FailureManifest.h new file mode 100644 index 000000000..a703e933e --- /dev/null +++ b/common/include/nbl/examples/Tester/FailureManifest.h @@ -0,0 +1,331 @@ +#ifndef _NBL_COMMON_TESTER_FAILURE_MANIFEST_INCLUDED_ +#define _NBL_COMMON_TESTER_FAILURE_MANIFEST_INCLUDED_ + +#include + +#include "nlohmann/json.hpp" + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +namespace nbl::examples::testing +{ + +struct FailureCase +{ + std::string check; + std::string side; + uint64_t iteration = 0; + uint32_t seed = 0; + double maxRelative = 0.0; + double maxAbsolute = 0.0; +}; + +struct FailureGroup +{ + std::string phase; + std::string id; + std::string name; + std::string logFile; + std::vector cases; + uint32_t omittedCases = 0; +}; + +class FailureManifest +{ + public: + explicit FailureManifest(std::string suite = {}) : m_suite(std::move(suite)) {} + + void setSuite(std::string suite) { m_suite = std::move(suite); } + + void addGroupFailure(std::string_view phase, std::string_view id, std::string_view name, std::string_view logFile = {}) + { + auto& group = groupFor(phase, id, name); + if (!logFile.empty()) + group.logFile = std::string(logFile); + } + + void addCase(std::string_view phase, std::string_view id, std::string_view name, std::string_view check, std::string_view side, + uint64_t iteration, uint32_t seed, double maxRelative, double maxAbsolute) + { + auto& group = groupFor(phase, id, name); + if (group.cases.size() >= MaxCasesPerGroup) + { + ++group.omittedCases; + return; + } + + group.cases.push_back(FailureCase{ + .check = std::string(check), + .side = std::string(side), + .iteration = iteration, + .seed = seed, + .maxRelative = maxRelative, + .maxAbsolute = maxAbsolute, + }); + } + + const std::vector& failures() const { return m_failures; } + + nlohmann::json toJson() const + { + nlohmann::json doc; + doc["version"] = 1; + doc["suite"] = m_suite; + auto& failures = doc["failures"] = nlohmann::json::array(); + + for (const auto& group : m_failures) + { + nlohmann::json g; + g["phase"] = group.phase; + g["id"] = group.id; + g["name"] = group.name; + if (!group.logFile.empty()) + g["log_file"] = group.logFile; + + auto& cases = g["cases"] = nlohmann::json::array(); + for (const auto& c : group.cases) + { + nlohmann::json row; + row["check"] = c.check; + row["side"] = c.side; + row["iteration"] = c.iteration; + row["seed"] = c.seed; + row["max_relative"] = c.maxRelative; + row["max_absolute"] = c.maxAbsolute; + cases.push_back(std::move(row)); + } + + if (group.omittedCases > 0) + g["omitted_cases"] = group.omittedCases; + + failures.push_back(std::move(g)); + } + + return doc; + } + + private: + static constexpr size_t MaxCasesPerGroup = 64; + + FailureGroup& groupFor(std::string_view phase, std::string_view id, std::string_view name) + { + const std::string idString(id); + auto it = std::find_if(m_failures.begin(), m_failures.end(), [&](const FailureGroup& g) { return g.id == idString; }); + if (it != m_failures.end()) + { + if (it->name.empty()) + it->name = std::string(name); + if (it->phase.empty()) + it->phase = std::string(phase); + return *it; + } + + m_failures.push_back(FailureGroup{ + .phase = std::string(phase), + .id = idString, + .name = std::string(name), + }); + return m_failures.back(); + } + + std::string m_suite; + std::vector m_failures; +}; + +class TestFilter +{ + public: + bool enabled() const { return m_enabled; } + + void enable() { m_enabled = true; } + + bool shouldRun(std::string_view id) const + { + return !m_enabled || m_ids.contains(std::string(id)); + } + + void add(std::string_view id) + { + m_enabled = true; + const auto first = id.find_first_not_of(" \t\r\n"); + if (first == std::string_view::npos) + return; + const auto last = id.find_last_not_of(" \t\r\n"); + m_ids.insert(std::string(id.substr(first, last - first + 1))); + } + + void addSeed(std::string_view id, uint32_t seed) + { + add(id); + m_seeds[std::string(id)] = seed; + } + + void addList(std::string_view ids) + { + m_enabled = true; + while (!ids.empty()) + { + const auto comma = ids.find(','); + add(ids.substr(0, comma)); + if (comma == std::string_view::npos) + return; + ids.remove_prefix(comma + 1); + } + } + + std::optional seedFor(std::string_view id) const + { + auto it = m_seeds.find(std::string(id)); + if (it == m_seeds.end()) + return {}; + return it->second; + } + + private: + bool m_enabled = false; + std::set m_ids; + std::map m_seeds; +}; + +struct RunControl +{ + bool valid = true; + bool skipBenchmarks = false; + std::string failedOutPath; + TestFilter filter; +}; + +inline bool addFailedIdsFromFile(TestFilter& filter, const std::string& path, nbl::system::ILogger* logger) +{ + filter.enable(); + std::ifstream in(path); + if (!in.is_open()) + { + if (logger) + logger->log("Failed to open failed-test manifest '%s'", nbl::system::ILogger::ELL_ERROR, path.c_str()); + return false; + } + + nlohmann::json doc; + try + { + in >> doc; + } + catch (const std::exception& e) + { + if (logger) + logger->log("Failed to parse failed-test manifest '%s': %s", nbl::system::ILogger::ELL_ERROR, path.c_str(), e.what()); + return false; + } + + const auto failuresIt = doc.find("failures"); + if (failuresIt == doc.end() || !failuresIt->is_array()) + { + if (logger) + logger->log("Failed-test manifest '%s' does not contain a failures array", nbl::system::ILogger::ELL_ERROR, path.c_str()); + return false; + } + + for (const auto& failure : *failuresIt) + { + if (!failure.is_object()) + continue; + const auto idIt = failure.find("id"); + if (idIt != failure.end() && idIt->is_string()) + { + const std::string id = idIt->get(); + const auto casesIt = failure.find("cases"); + if (casesIt != failure.end() && casesIt->is_array()) + { + const auto seedIt = std::find_if(casesIt->begin(), casesIt->end(), [](const nlohmann::json& row) { + if (!row.is_object()) + return false; + const auto it = row.find("seed"); + return it != row.end() && it->is_number_integer(); + }); + if (seedIt != casesIt->end()) + { + filter.addSeed(id, (*seedIt)["seed"].get()); + continue; + } + } + filter.add(id); + } + } + + return true; +} + +inline RunControl parseRunControl(std::span argv, nbl::system::ILogger* logger) +{ + RunControl out; + + for (size_t i = 1; i < argv.size(); ++i) + { + const std::string& arg = argv[i]; + if (arg == "--skip-benchmarks") + out.skipBenchmarks = true; + else if (arg == "--failed-out" && i + 1 < argv.size()) + out.failedOutPath = argv[++i]; + else if (arg.starts_with("--failed-out=")) + out.failedOutPath = arg.substr(std::string("--failed-out=").size()); + else if (arg == "--test" && i + 1 < argv.size()) + out.filter.addList(argv[++i]); + else if (arg.starts_with("--test=")) + out.filter.addList(std::string_view(arg).substr(std::string_view("--test=").size())); + else if (arg == "--rerun-failed" && i + 1 < argv.size()) + { + if (!addFailedIdsFromFile(out.filter, argv[++i], logger)) + out.valid = false; + } + else if (arg.starts_with("--rerun-failed=")) + { + if (!addFailedIdsFromFile(out.filter, arg.substr(std::string("--rerun-failed=").size()), logger)) + out.valid = false; + } + } + + if (out.filter.enabled()) + out.skipBenchmarks = true; + + return out; +} + +inline bool writeFailureManifestFile(const FailureManifest& manifest, const std::string& path, nbl::system::ILogger* logger) +{ + std::ofstream out(path, std::ios::out | std::ios::trunc); + if (!out.is_open()) + { + if (logger) + logger->log("Failed to open failed-test manifest '%s' for writing", nbl::system::ILogger::ELL_ERROR, path.c_str()); + return false; + } + + out << manifest.toJson().dump(3) << '\n'; + if (!out.good()) + { + if (logger) + logger->log("Failed to write failed-test manifest '%s'", nbl::system::ILogger::ELL_ERROR, path.c_str()); + return false; + } + + if (logger) + logger->log("Wrote failed-test manifest '%s' with %llu failed groups", nbl::system::ILogger::ELL_INFO, + path.c_str(), static_cast(manifest.failures().size())); + return true; +} + +} // namespace nbl::examples::testing + +#endif diff --git a/common/include/nbl/examples/Tester/ITester.h b/common/include/nbl/examples/Tester/ITester.h index 8fd4c6639..27dfccff2 100644 --- a/common/include/nbl/examples/Tester/ITester.h +++ b/common/include/nbl/examples/Tester/ITester.h @@ -3,6 +3,7 @@ #include #include +#include #include #include #include @@ -171,6 +172,7 @@ class ITester bool performTestsAndVerifyResults(const std::string& logFileName) { + m_failureLogFile = logFileName; m_logFile.open(logFileName, std::ios::out | std::ios::trunc); if (!m_logFile.is_open()) m_logger->log("Failed to open log file!", system::ILogger::ELL_ERROR); @@ -197,6 +199,8 @@ class ITester core::vector gpuTestResults = performGpuTests(inputTestValues); bool pass = verifyAllTestResults(cpuTestResults, gpuTestResults, exceptedTestResults); + if (!pass && m_failureManifest) + m_failureManifest->addGroupFailure(m_failurePhase, m_failureId, m_failureName, m_failureLogFile); m_logger->log("TESTS DONE.", system::ILogger::ELL_PERFORMANCE); reloadSeed(); @@ -205,6 +209,20 @@ class ITester return pass; } + void setFailureRecordContext(nbl::examples::testing::FailureManifest* manifest, std::string phase, std::string id, std::string name) + { + m_failureManifest = manifest; + m_failurePhase = std::move(phase); + m_failureId = std::move(id); + m_failureName = std::move(name); + } + + void setSeed(uint32_t seed) + { + m_seed = seed; + m_mersenneTwister = std::mt19937(m_seed); + } + virtual ~ITester() { m_outputBufferAllocation.memory->unmap(); @@ -339,6 +357,13 @@ class ITester ss << " DIFFERENCE: " << system::to_string(hlsl::abs(expectedVal - testVal)); ss << " MAX RELATIVE: " << system::to_string(maxRelativeDifference) << " MAX ABSOLUTE " << system::to_string(maxAbsoluteDifference) << '\n'; + if (m_failureManifest) + { + const char* side = testType == TestType::CPU ? "CPU" : "GPU"; + m_failureManifest->addCase(m_failurePhase, m_failureId, m_failureName, memberName, side, + testIteration, seed, maxRelativeDifference, maxAbsoluteDifference); + } + m_logger->log("%s", system::ILogger::ELL_ERROR, ss.str().c_str()); m_logFile << ss.str() << '\n'; } @@ -439,6 +464,11 @@ class ITester uint32_t m_seed; std::ofstream m_logFile; core::unordered_map m_maxErrors; + nbl::examples::testing::FailureManifest* m_failureManifest = nullptr; + std::string m_failurePhase; + std::string m_failureId; + std::string m_failureName; + std::string m_failureLogFile; }; -#endif \ No newline at end of file +#endif