Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
400 changes: 149 additions & 251 deletions 37_HLSLSamplingTests/benchmarks/CDiscreteSamplerBenchmark.h

Large diffs are not rendered by default.

275 changes: 41 additions & 234 deletions 37_HLSLSamplingTests/benchmarks/CSamplerBenchmark.h
Original file line number Diff line number Diff line change
Expand Up @@ -7,249 +7,56 @@

#include <nabla.h>
#include "nbl/examples/examples.hpp"
#include "../app_resources/common/sampler_bench_pc.hlsl"
#include "nbl/examples/Benchmark/IBenchmark.h"
#include "nbl/examples/Benchmark/GPUBenchmarkHelper.h"
#include "app_resources/common/sampler_bench_pc.hlsl"

using namespace nbl;

// Measures GPU execution time of a sampler shader using GPU timestamp queries.
class CSamplerBenchmark
// The output buffer is addressed implicitly via BDA (buffer device address),
// passed to the shader in SamplerBenchPushConstants. GPU plumbing (pipeline /
// buffer / timestamp queries) comes from GPUBenchmarkHelper; the bench-side glue
// here is the push-constant layout, per-run dispatch and result recording.
class CSamplerBenchmark : public GPUBenchmark
{
public:
// Bundle of inputs consumed by setup(). The caller fills it in once; setup()
// copies out what it needs and creates all GPU objects from it.
struct SetupData
{
// Logical device every GPU object is created on; kept in m_device.
core::smart_refctd_ptr<video::ILogicalDevice> device;
// NOTE(review): not read by setup() in the visible code — confirm whether it is still needed.
core::smart_refctd_ptr<video::CVulkanConnection> api;
// Used to load the shader asset named by shaderKey.
core::smart_refctd_ptr<asset::IAssetManager> assetMgr;
// Receives error and performance log lines; kept in m_logger.
core::smart_refctd_ptr<system::ILogger> logger;
// Non-owning; supplies the device-local memory type bits and is stored for later use.
video::IPhysicalDevice* physicalDevice;
// Queue family used for the command pool and for the queue (index 0) that work is submitted to.
uint32_t computeFamilyIndex;
// Asset key handed to the asset manager to load the benchmark shader.
std::string shaderKey;
uint32_t dispatchGroupCount; // workgroup count = testBatchCount
uint32_t samplesPerDispatch; // dispatchGroupCount * WorkgroupSize * benchIters
size_t inputBufferBytes; // sizeof(InputType) * samplesPerDispatch
size_t outputBufferBytes; // sizeof(ResultType) * samplesPerDispatch
};

void setup(const SetupData& data)
{
m_device = data.device;
m_logger = data.logger;
m_dispatchGroupCount = data.dispatchGroupCount;

// Single cmdbuf holds [warmup dispatches][ts 0][bench dispatches][ts 1][cooldown dispatches]
// so the driver can pipeline adjacent dispatches and the trailing bench dispatches
// aren't measured in a winding-down tail.
m_cmdpool = m_device->createCommandPool(data.computeFamilyIndex, video::IGPUCommandPool::CREATE_FLAGS::RESET_COMMAND_BUFFER_BIT);
if (!m_cmdpool->createCommandBuffers(video::IGPUCommandPool::BUFFER_LEVEL::PRIMARY, 1u, &m_benchmarkCmdbuf))
m_logger->log("CSamplerBenchmark: failed to create benchmark cmdbuf", system::ILogger::ELL_ERROR);

// Timestamp query pool (2 queries: before and after)
{
video::IQueryPool::SCreationParams qparams = {};
qparams.queryType = video::IQueryPool::TYPE::TIMESTAMP;
qparams.queryCount = 2;
qparams.pipelineStatisticsFlags = video::IQueryPool::PIPELINE_STATISTICS_FLAGS::NONE;
m_queryPool = m_device->createQueryPool(qparams);
if (!m_queryPool)
m_logger->log("CSamplerBenchmark: failed to create query pool", system::ILogger::ELL_ERROR);
}

// Load and compile shader
core::smart_refctd_ptr<asset::IShader> shader;
{
asset::IAssetLoader::SAssetLoadParams lp = {};
lp.logger = m_logger.get();
lp.workingDirectory = "app_resources";
auto bundle = data.assetMgr->getAsset(data.shaderKey, lp);
const auto assets = bundle.getContents();
if (assets.empty())
{
m_logger->log("CSamplerBenchmark: failed to load shader", system::ILogger::ELL_ERROR);
return;
}
auto source = asset::IAsset::castDown<asset::IShader>(assets[0]);
shader = m_device->compileShader({ source.get() });
}

// Descriptor set layout: binding 0 = input SSBO, binding 1 = output SSBO
video::IGPUDescriptorSetLayout::SBinding bindings[2] = {
{ .binding = 0, .type = asset::IDescriptor::E_TYPE::ET_STORAGE_BUFFER,
.createFlags = video::IGPUDescriptorSetLayout::SBinding::E_CREATE_FLAGS::ECF_NONE,
.stageFlags = ShaderStage::ESS_COMPUTE, .count = 1 },
{ .binding = 1, .type = asset::IDescriptor::E_TYPE::ET_STORAGE_BUFFER,
.createFlags = video::IGPUDescriptorSetLayout::SBinding::E_CREATE_FLAGS::ECF_NONE,
.stageFlags = ShaderStage::ESS_COMPUTE, .count = 1 }
};
auto dsLayout = m_device->createDescriptorSetLayout(bindings);

const asset::SPushConstantRange pcRange = {
.stageFlags = asset::IShader::E_SHADER_STAGE::ESS_COMPUTE,
.offset = 0,
.size = sizeof(SamplerBenchPushConstants),
};
m_pplnLayout = m_device->createPipelineLayout({&pcRange, 1}, core::smart_refctd_ptr(dsLayout));

{
video::IGPUComputePipeline::SCreationParams pparams = {};
pparams.layout = m_pplnLayout.get();
pparams.shader.entryPoint = "main";
pparams.shader.shader = shader.get();
if (m_device->getEnabledFeatures().pipelineExecutableInfo)
{
pparams.flags |= IGPUComputePipeline::SCreationParams::FLAGS::CAPTURE_STATISTICS | IGPUComputePipeline::SCreationParams::FLAGS::CAPTURE_INTERNAL_REPRESENTATIONS;
}
if (!m_device->createComputePipelines(nullptr, { &pparams, 1 }, &m_pipeline))
m_logger->log("CSamplerBenchmark: failed to create compute pipeline", system::ILogger::ELL_ERROR);

if (m_device->getEnabledFeatures().pipelineExecutableInfo)
m_executableReport = system::to_string(m_pipeline->getExecutableInfo());
}

// Allocate input buffer (device-local VRAM, zero-filled via cmdFillBuffer; correctness
// irrelevant for benchmarking but we want deterministic input, not garbage)
core::smart_refctd_ptr<video::IGPUBuffer> inputBuf;
{
video::IGPUBuffer::SCreationParams bparams = {};
bparams.size = data.inputBufferBytes;
bparams.usage = core::bitflag(video::IGPUBuffer::EUF_STORAGE_BUFFER_BIT) | video::IGPUBuffer::EUF_TRANSFER_DST_BIT;
inputBuf = m_device->createBuffer(std::move(bparams));
video::IDeviceMemoryBacked::SDeviceMemoryRequirements reqs = inputBuf->getMemoryReqs();
reqs.memoryTypeBits &= data.physicalDevice->getDeviceLocalMemoryTypeBits();
m_inputAlloc = m_device->allocate(reqs, inputBuf.get(), video::IDeviceMemoryAllocation::EMAF_NONE);
if (!m_inputAlloc.isValid())
m_logger->log("CSamplerBenchmark: failed to allocate input buffer memory", system::ILogger::ELL_ERROR);
}

// Allocate output buffer (device-local VRAM, GPU writes, never read back)
core::smart_refctd_ptr<video::IGPUBuffer> outputBuf;
{
video::IGPUBuffer::SCreationParams bparams = {};
bparams.size = data.outputBufferBytes;
bparams.usage = core::bitflag(video::IGPUBuffer::EUF_STORAGE_BUFFER_BIT) | video::IGPUBuffer::EUF_SHADER_DEVICE_ADDRESS_BIT;
outputBuf = m_device->createBuffer(std::move(bparams));
video::IDeviceMemoryBacked::SDeviceMemoryRequirements reqs = outputBuf->getMemoryReqs();
reqs.memoryTypeBits &= data.physicalDevice->getDeviceLocalMemoryTypeBits();
m_outputAlloc = m_device->allocate(reqs, outputBuf.get(), video::IDeviceMemoryAllocation::EMAF_DEVICE_ADDRESS_BIT);
if (!m_outputAlloc.isValid())
m_logger->log("CSamplerBenchmark: failed to allocate output buffer memory", system::ILogger::ELL_ERROR);
m_outputAddress = outputBuf->getDeviceAddress();
}

// Zero-fill the input buffer once on the GPU
{
core::smart_refctd_ptr<video::IGPUCommandBuffer> initCmdbuf;
m_cmdpool->createCommandBuffers(video::IGPUCommandPool::BUFFER_LEVEL::PRIMARY, 1u, &initCmdbuf);
initCmdbuf->begin(video::IGPUCommandBuffer::USAGE::ONE_TIME_SUBMIT_BIT);
const asset::SBufferRange<video::IGPUBuffer> range = { .offset = 0, .size = data.inputBufferBytes, .buffer = inputBuf };
initCmdbuf->fillBuffer(range, 0u);
initCmdbuf->end();

auto queue = m_device->getQueue(data.computeFamilyIndex, 0);
const video::IQueue::SSubmitInfo::SCommandBufferInfo cmds[] = { {.cmdbuf = initCmdbuf.get()} };
video::IQueue::SSubmitInfo submit = {};
submit.commandBuffers = cmds;
queue->submit({&submit, 1u});
m_device->waitIdle();
}

// Descriptor set: bind both buffers
auto pool = m_device->createDescriptorPoolForDSLayouts(video::IDescriptorPool::ECF_NONE, { &dsLayout.get(), 1 });
m_ds = pool->createDescriptorSet(core::smart_refctd_ptr(dsLayout));
{
video::IGPUDescriptorSet::SDescriptorInfo info[2];
info[0].desc = core::smart_refctd_ptr(inputBuf);
info[0].info.buffer = { .offset = 0, .size = data.inputBufferBytes };
info[1].desc = core::smart_refctd_ptr(outputBuf);
info[1].info.buffer = { .offset = 0, .size = data.outputBufferBytes };
video::IGPUDescriptorSet::SWriteDescriptorSet writes[2] = {
{ .dstSet = m_ds.get(), .binding = 0, .arrayElement = 0, .count = 1, .info = &info[0] },
{ .dstSet = m_ds.get(), .binding = 1, .arrayElement = 0, .count = 1, .info = &info[1] }
};
m_device->updateDescriptorSets(writes, {});
}

m_queue = m_device->getQueue(data.computeFamilyIndex, 0);
m_samplesPerDispatch = data.samplesPerDispatch;
m_physicalDevice = data.physicalDevice;
}

void logPipelineReport(const std::string& name) const
public:
struct SetupData : GPUBenchmark::SetupData
{
if (!m_executableReport.empty())
m_logger->log("%s Sampler Benchmark Pipeline Executable Report:\n%s", ILogger::ELL_PERFORMANCE, name.c_str(), m_executableReport.c_str());
}

void run(const std::string& samplerName, const std::string& mode, uint32_t warmupIterations = 500, uint32_t benchmarkIterations = 5000)
{
m_device->waitIdle();

const uint32_t cooldownIterations = warmupIterations;

m_benchmarkCmdbuf->reset(video::IGPUCommandBuffer::RESET_FLAGS::NONE);
m_benchmarkCmdbuf->begin(video::IGPUCommandBuffer::USAGE::ONE_TIME_SUBMIT_BIT);
m_benchmarkCmdbuf->resetQueryPool(m_queryPool.get(), 0, 2);
m_benchmarkCmdbuf->bindComputePipeline(m_pipeline.get());
m_benchmarkCmdbuf->bindDescriptorSets(asset::EPBP_COMPUTE, m_pplnLayout.get(), 0, 1, &m_ds.get());
{
SamplerBenchPushConstants pc = { .outputAddress = m_outputAddress };
m_benchmarkCmdbuf->pushConstants(m_pplnLayout.get(), asset::IShader::E_SHADER_STAGE::ESS_COMPUTE, 0, sizeof(pc), &pc);
}
for (uint32_t i = 0u; i < warmupIterations; ++i)
m_benchmarkCmdbuf->dispatch(m_dispatchGroupCount, 1, 1);
m_benchmarkCmdbuf->writeTimestamp(asset::PIPELINE_STAGE_FLAGS::COMPUTE_SHADER_BIT, m_queryPool.get(), 0);
for (uint32_t i = 0u; i < benchmarkIterations; ++i)
m_benchmarkCmdbuf->dispatch(m_dispatchGroupCount, 1, 1);
m_benchmarkCmdbuf->writeTimestamp(asset::PIPELINE_STAGE_FLAGS::COMPUTE_SHADER_BIT, m_queryPool.get(), 1);
for (uint32_t i = 0u; i < cooldownIterations; ++i)
m_benchmarkCmdbuf->dispatch(m_dispatchGroupCount, 1, 1);
m_benchmarkCmdbuf->end();
core::smart_refctd_ptr<asset::IAssetManager> assetMgr;
GPUBenchmarkHelper::ShaderVariant variant; // precompiled key OR source path + defines
size_t outputBufferBytes; // sizeof(uint32_t) * threadsPerDispatch
};

auto semaphore = m_device->createSemaphore(0u);
const video::IQueue::SSubmitInfo::SCommandBufferInfo benchCmds[] = { {.cmdbuf = m_benchmarkCmdbuf.get()} };
const video::IQueue::SSubmitInfo::SSemaphoreInfo signalSem[] = {
{.semaphore = semaphore.get(), .value = 1u, .stageMask = asset::PIPELINE_STAGE_FLAGS::COMPUTE_SHADER_BIT}
};
video::IQueue::SSubmitInfo submit = {};
submit.commandBuffers = benchCmds;
submit.signalSemaphores = signalSem;
m_queue->submit({&submit, 1u});

m_device->waitIdle();

uint64_t timestamps[2] = {};
const auto flags = core::bitflag(video::IQueryPool::RESULTS_FLAGS::_64_BIT) |
core::bitflag(video::IQueryPool::RESULTS_FLAGS::WAIT_BIT);
m_device->getQueryPoolResults(m_queryPool.get(), 0, 2, timestamps, sizeof(uint64_t), flags);

const float64_t timestampPeriod = float64_t(m_physicalDevice->getLimits().timestampPeriodInNanoSeconds);
const float64_t elapsed_ns = float64_t(timestamps[1] - timestamps[0]) * timestampPeriod;
const uint64_t total_samples = uint64_t(benchmarkIterations) * uint64_t(m_samplesPerDispatch);
const float64_t ps_per_sample = elapsed_ns * 1e3 / float64_t(total_samples);
const float64_t gsamples_per_s = float64_t(total_samples) / elapsed_ns;
const float64_t elapsed_ms = elapsed_ns * 1e-6;
CSamplerBenchmark(Aggregator& aggregator, const SetupData& data)
: GPUBenchmark(aggregator, data) // slicing-copy of the GPUBenchmark::SetupData base
{
auto bda = createBdaOutputBuffer(data.outputBufferBytes);
m_outputBuf = std::move(bda.buf);
m_outputAddress = bda.address;

m_logger->log("[Benchmark] %-28s | %-38s | %12.3f | %12.3f | %12.3f",
system::ILogger::ELL_PERFORMANCE,
samplerName.c_str(), mode.c_str(), ps_per_sample, gsamples_per_s, elapsed_ms);
}
m_pipelineIdx = createPipeline(data.variant, data.assetMgr, sizeof(SamplerBenchPushConstants), joinName(data.name));
}

private:
core::smart_refctd_ptr<video::ILogicalDevice> m_device;
core::smart_refctd_ptr<system::ILogger> m_logger;
core::smart_refctd_ptr<video::IGPUCommandPool> m_cmdpool;
core::smart_refctd_ptr<video::IGPUCommandBuffer> m_benchmarkCmdbuf;
core::smart_refctd_ptr<video::IQueryPool> m_queryPool;
core::smart_refctd_ptr<video::IGPUPipelineLayout> m_pplnLayout;
core::smart_refctd_ptr<video::IGPUComputePipeline> m_pipeline;
core::smart_refctd_ptr<video::IGPUDescriptorSet> m_ds;
video::IDeviceMemoryAllocator::SAllocation m_inputAlloc = {};
video::IDeviceMemoryAllocator::SAllocation m_outputAlloc = {};
uint64_t m_outputAddress = 0;
video::IQueue* m_queue = nullptr;
video::IPhysicalDevice* m_physicalDevice = nullptr;
uint32_t m_dispatchGroupCount = 0;
uint32_t m_samplesPerDispatch = 0;
std::string m_executableReport;
void doRun() override
{
const PipelineEntry* pe = getPipelineEntry(m_pipelineIdx, joinName(m_name));
if (!pe)
return;
SamplerBenchPushConstants pc = {};
pc.outputAddress = m_outputAddress;

const TimingResult t = runTimedBudgeted(getWarmupDispatches(), getTargetBudgetMs(),
[&](video::IGPUCommandBuffer* cb) { defaultBindAndPush(cb, *pe, pc); },
[this](video::IGPUCommandBuffer* cb) { defaultDispatch(cb); },
samplesForCurrentRow());

record(m_name, t, pe->stats);
}

private:
core::smart_refctd_ptr<video::IGPUBuffer> m_outputBuf;
uint64_t m_outputAddress = 0;
uint32_t m_pipelineIdx = 0;
};

#endif
Loading