MayaFlux 0.4.0
Digital-First Multimedia Processing Framework
Loading...
Searching...
No Matches
GpuDispatchCore.cpp
Go to the documentation of this file.
1#include "GpuDispatchCore.hpp"
2
4
5namespace MayaFlux::Yantra {
6
8 : m_gpu_config(std::move(config))
9{
10}
11
12//==============================================================================
13// Public interface
14//==============================================================================
15
16void GpuDispatchCore::set_push_constants(const void* data, size_t bytes)
17{
18 m_push_constants.resize(bytes);
19 std::memcpy(m_push_constants.data(), data, bytes);
20}
21
22void GpuDispatchCore::set_output_size(size_t index, size_t byte_size)
23{
24 if (index >= m_output_size_overrides.size())
25 m_output_size_overrides.resize(index + 1, 0);
26 m_output_size_overrides[index] = byte_size;
27}
28
36
38{
39 return m_resources.is_ready();
40}
41
42std::shared_ptr<Core::VKImage> GpuDispatchCore::get_output_image(size_t binding_index) const
43{
44 if (binding_index >= m_image_bindings.size())
45 return nullptr;
46 return m_image_bindings[binding_index].image;
47}
48
53
54//==============================================================================
55// Protected staging helpers
56//==============================================================================
57
58void GpuDispatchCore::stage_passthrough(size_t binding_index, const void* data, size_t byte_size)
59{
60 if (binding_index >= m_passthrough_bytes.size())
61 m_passthrough_bytes.resize(binding_index + 1);
62 auto& slot = m_passthrough_bytes[binding_index];
63 slot.resize(byte_size);
64 std::memcpy(slot.data(), data, byte_size);
65}
66
67void GpuDispatchCore::stage_image_storage(size_t binding_index, std::shared_ptr<Core::VKImage> image)
68{
69 if (binding_index >= m_image_bindings.size())
70 m_image_bindings.resize(binding_index + 1);
71 m_image_bindings[binding_index] = { .image = std::move(image), .sampler = nullptr, .kind = GpuBufferBinding::ElementType::IMAGE_STORAGE };
72}
73
74void GpuDispatchCore::stage_image_sampled(size_t binding_index,
75 std::shared_ptr<Core::VKImage> image,
76 vk::Sampler sampler)
77{
78 if (binding_index >= m_image_bindings.size())
79 m_image_bindings.resize(binding_index + 1);
80 m_image_bindings[binding_index] = { .image = std::move(image), .sampler = sampler, .kind = GpuBufferBinding::ElementType::IMAGE_SAMPLED };
81}
82
83//==============================================================================
84// Virtual override points
85//==============================================================================
86
87std::vector<GpuBufferBinding> GpuDispatchCore::declare_buffer_bindings() const
88{
89 return {
90 { .set = 0, .binding = 0, .direction = GpuBufferBinding::Direction::INPUT, .element_type = GpuBufferBinding::ElementType::FLOAT32 },
91 { .set = 0, .binding = 1, .direction = GpuBufferBinding::Direction::OUTPUT, .element_type = GpuBufferBinding::ElementType::FLOAT32 },
92 };
93}
94
96 const std::vector<std::vector<double>>&,
97 const DataStructureInfo&)
98{
99}
100
102 const std::vector<std::vector<double>>& channels,
103 const DataStructureInfo& structure_info)
104{
105 flatten_channels_to_staging(channels, structure_info);
106 const size_t float_byte_size = m_staging_floats.size() * sizeof(float);
107
108 const size_t fallback_bytes = float_byte_size > 0
109 ? float_byte_size
110 : Kakshya::ContainerDataStructure::get_total_elements(structure_info.dimensions) * sizeof(float);
111
112 for (size_t i = 0; i < m_bindings.size(); ++i) {
113 const auto& b = m_bindings[i];
114
115 if (i < m_binding_data.size() && !m_binding_data[i].empty()) {
117 m_resources.upload_raw(i, m_binding_data[i].data(), m_binding_data[i].size());
118 continue;
119 }
120
121 if (b.direction == GpuBufferBinding::Direction::OUTPUT) {
122 const auto et = b.element_type;
125 } else {
126 const size_t sz = (i < m_output_size_overrides.size() && m_output_size_overrides[i] > 0)
128 : fallback_bytes;
130 if (i < m_output_size_overrides.size() && m_output_size_overrides[i] > 0) {
131 std::vector<uint8_t> zeros(sz, 0);
132 m_resources.upload_raw(i, zeros.data(), sz);
133 }
134 continue;
135 }
136 }
137
138 switch (b.element_type) {
140 if (i < m_passthrough_bytes.size() && !m_passthrough_bytes[i].empty()) {
144 }
145 break;
146
148 if (i >= m_image_bindings.size() || !m_image_bindings[i].image)
149 continue;
150 auto& img = m_image_bindings[i].image;
151 if (img->get_current_layout() != vk::ImageLayout::eGeneral) {
152 m_resources.transition_image(img, img->get_current_layout(),
153 vk::ImageLayout::eGeneral);
154 }
156 } break;
157
159 if (i >= m_image_bindings.size() || !m_image_bindings[i].image)
160 continue;
161 auto& img = m_image_bindings[i].image;
162 auto sampler = m_image_bindings[i].sampler;
163 if (img->get_current_layout() != vk::ImageLayout::eShaderReadOnlyOptimal) {
164 m_resources.transition_image(img, img->get_current_layout(),
165 vk::ImageLayout::eShaderReadOnlyOptimal);
166 }
167 m_resources.bind_image_sampled(i, img, sampler, b);
168 } break;
169
172 if (!channels.empty()) {
173 const size_t raw_bytes = channels[0].size()
175 ? sizeof(uint32_t)
176 : sizeof(int32_t));
177 m_resources.ensure_buffer(i, raw_bytes);
179 reinterpret_cast<const uint8_t*>(channels[0].data()),
180 raw_bytes);
181 }
182 break;
183
185 default:
186 m_resources.ensure_buffer(i, float_byte_size);
187 m_resources.upload(i, m_staging_floats.data(), float_byte_size);
188 break;
189 }
190 }
191}
192
194 size_t total_elements, const DataStructureInfo& structure_info) const
195{
196 uint64_t sz_x = 0, sz_y = 0, sz_z = 0;
197 for (const auto& dim : structure_info.dimensions) {
198 switch (dim.role) {
200 sz_x = dim.size;
201 break;
203 sz_y = dim.size;
204 break;
206 sz_z = dim.size;
207 break;
208 default:
209 break;
210 }
211 }
212
213 const auto& ws = m_gpu_config.workgroup_size;
214 if (sz_x > 0) {
215 return {
216 static_cast<uint32_t>((sz_x + ws[0] - 1) / ws[0]),
217 sz_y > 0 ? static_cast<uint32_t>((sz_y + ws[1] - 1) / ws[1]) : 1U,
218 sz_z > 0 ? static_cast<uint32_t>((sz_z + ws[2] - 1) / ws[2]) : 1U,
219 };
220 }
221
222 return { static_cast<uint32_t>((total_elements + ws[0] - 1) / ws[0]), 1U, 1U };
223}
224
225//==============================================================================
226// Dispatch
227//==============================================================================
228
230 const std::vector<std::vector<double>>& channels,
231 const DataStructureInfo& structure_info)
232{
233 on_before_gpu_dispatch(channels, structure_info);
234 prepare_gpu_inputs(channels, structure_info);
235
236 for (size_t i = 0; i < m_bindings.size(); ++i) {
237 const auto et = m_bindings[i].element_type;
241 }
242
243 const size_t effective = m_staging_floats.empty()
245 : m_staging_floats.size();
246 const auto groups = calculate_dispatch_size(effective, structure_info);
247
249 groups, m_bindings,
250 m_push_constants.empty() ? nullptr : m_push_constants.data(),
251 m_push_constants.size());
252
253 GpuChannelResult result;
254 result.primary = readback_primary(effective);
255 readback_aux(result);
256 return result;
257}
258
260 const std::vector<std::vector<double>>& channels,
261 const DataStructureInfo& structure_info,
262 const ExecutionContext& ctx)
263{
264 on_before_gpu_dispatch(channels, structure_info);
265 prepare_gpu_inputs(channels, structure_info);
266
267 for (size_t i = 0; i < m_bindings.size(); ++i) {
268 const auto et = m_bindings[i].element_type;
272 }
273
274 const size_t effective = m_staging_floats.empty()
276 : m_staging_floats.size();
277 const auto groups = calculate_dispatch_size(effective, structure_info);
278
279 if (!ctx.execution_metadata.contains("pass_count") || !ctx.execution_metadata.contains("pc_updater")) {
280 error<std::runtime_error>(Journal::Component::Yantra,
282 std::source_location::current(),
283 "GpuDispatchCore: dispatch_core_chained requires 'pass_count' and 'pc_updater' in execution_metadata");
284 }
285
286 const auto pass_count = safe_any_cast_or_throw<uint32_t>(ctx.execution_metadata.at("pass_count"));
287 const auto& pc_updater = safe_any_cast_or_throw<std::function<void(uint32_t, void*)>>(ctx.execution_metadata.at("pc_updater"));
288
290 pass_count, groups, m_bindings,
291 [&](uint32_t pass, std::vector<uint8_t>& pc_data) { pc_updater(pass, pc_data.data()); },
294
295 GpuChannelResult result;
296 result.primary = readback_primary(effective);
297 readback_aux(result);
298 return result;
299}
300
301//==============================================================================
302// Readback helpers
303//==============================================================================
304
305std::vector<float> GpuDispatchCore::readback_primary(size_t float_count)
306{
307 const size_t idx = find_first_output_index();
308
309 if (idx < m_bindings.size()) {
310 const auto et = m_bindings[idx].element_type;
313 return {};
314 }
315
316 const size_t allocated = m_resources.buffer_allocated_bytes(idx);
317 const size_t byte_size = std::min(float_count * sizeof(float), allocated);
318 std::vector<float> out(byte_size / sizeof(float));
319 m_resources.download(idx, out.data(), byte_size);
320 return out;
321}
322
324{
325 for (size_t i = 0; i < m_bindings.size(); ++i) {
326 const auto dir = m_bindings[i].direction;
327 const auto et = m_bindings[i].element_type;
332 && !is_image
333 && i < m_output_size_overrides.size()
334 && m_output_size_overrides[i] > 0) {
335 const size_t sz = m_output_size_overrides[i];
336 std::vector<uint8_t> raw(sz);
337 m_resources.download(i, reinterpret_cast<float*>(raw.data()), sz);
338 result.aux[i] = std::move(raw);
339 }
340 }
341}
342
343//==============================================================================
344// Internal helpers
345//==============================================================================
346
348 const std::vector<std::vector<double>>& channels,
349 const DataStructureInfo& structure_info)
350{
351 m_staging_floats.clear();
352
353 if (Kakshya::is_structured_modality(structure_info.modality))
354 return;
355
356 bool all_inputs_staged = !m_bindings.empty();
357 for (size_t i = 0; i < m_bindings.size(); ++i) {
359 continue;
360 if (i >= m_binding_data.size() || m_binding_data[i].empty()) {
361 all_inputs_staged = false;
362 break;
363 }
364 }
365 if (all_inputs_staged)
366 return;
367
368 size_t total = 0;
369
370 for (const auto& ch : channels)
371 total += ch.size();
372 m_staging_floats.reserve(total);
373
374 for (const auto& ch : channels) {
375 for (double v : ch)
376 m_staging_floats.push_back(static_cast<float>(v));
377 }
378}
379
381{
382 size_t first_inout = SIZE_MAX;
383 for (size_t i = 0; i < m_bindings.size(); ++i) {
385 return i;
387 && first_inout == SIZE_MAX)
388 first_inout = i;
389 }
390 if (first_inout != SIZE_MAX)
391 return first_inout;
392
393 error<std::runtime_error>(Journal::Component::Yantra,
395 std::source_location::current(),
396 "GpuDispatchCore: no output buffer declared");
397}
398
400{
401 size_t max_bytes = 0;
402
403 for (size_t i = 0; i < m_bindings.size(); ++i) {
405 continue;
406 if (i < m_binding_data.size() && !m_binding_data[i].empty())
407 max_bytes = std::max(max_bytes, m_binding_data[i].size());
408 }
409
410 return max_bytes / sizeof(float);
411}
412
413} // namespace MayaFlux::Yantra
size_t b
IO::ImageData image
Range size
GpuDispatchCore(GpuShaderConfig config)
void readback_aux(GpuChannelResult &result)
Read back all OUTPUT bindings that have explicit size overrides into the aux map of a GpuChannelResul...
std::vector< ImageBinding > m_image_bindings
virtual std::array< uint32_t, 3 > calculate_dispatch_size(size_t total_elements, const DataStructureInfo &structure_info) const
Calculate workgroup dispatch counts from structure dimensions.
bool is_gpu_ready() const
Query GPU readiness without attempting initialisation.
std::shared_ptr< Core::VKImage > get_output_image(size_t binding_index) const
Return the image registered at an IMAGE_STORAGE output binding.
std::vector< uint8_t > m_push_constants
std::vector< std::vector< uint8_t > > m_binding_data
GpuChannelResult dispatch_core(const std::vector< std::vector< double > > &channels, const DataStructureInfo &structure_info)
Full single-pass dispatch.
void stage_passthrough(size_t binding_index, const void *data, size_t byte_size)
Stage raw bytes for a PASSTHROUGH binding before dispatch.
std::vector< GpuBufferBinding > m_bindings
bool ensure_gpu_ready()
Ensure GPU resources are initialised.
std::vector< size_t > m_output_size_overrides
void stage_image_storage(size_t binding_index, std::shared_ptr< Core::VKImage > image)
Register a VKImage for an IMAGE_STORAGE binding.
void stage_image_sampled(size_t binding_index, std::shared_ptr< Core::VKImage > image, vk::Sampler sampler)
Register a VKImage + sampler for an IMAGE_SAMPLED binding.
const GpuShaderConfig & gpu_config() const
GpuChannelResult dispatch_core_chained(const std::vector< std::vector< double > > &channels, const DataStructureInfo &structure_info, const ExecutionContext &ctx)
Multi-pass (chained) dispatch.
virtual void prepare_gpu_inputs(const std::vector< std::vector< double > > &channels, const DataStructureInfo &structure_info)
Marshal channel data into GPU input buffers.
std::vector< std::vector< uint8_t > > m_passthrough_bytes
virtual void on_before_gpu_dispatch(const std::vector< std::vector< double > > &channels, const DataStructureInfo &structure_info)
Called immediately before dispatch.
void set_output_size(size_t index, size_t byte_size)
Declare the byte capacity of an output binding independently of input data.
std::vector< float > readback_primary(size_t float_count)
Read back the primary output buffer into a float vector.
void flatten_channels_to_staging(const std::vector< std::vector< double > > &channels, const DataStructureInfo &structure_info)
Flatten planar double channels into m_staging_floats.
virtual std::vector< GpuBufferBinding > declare_buffer_bindings() const
Declare the storage buffers the shader expects.
void set_push_constants(const void *data, size_t bytes)
Set push constant data from a raw byte pointer.
size_t buffer_allocated_bytes(size_t index) const
void upload_raw(size_t index, const uint8_t *data, size_t byte_size)
void upload(size_t index, const float *data, size_t byte_size)
void download(size_t index, float *dest, size_t byte_size)
void bind_image_storage(size_t index, const std::shared_ptr< Core::VKImage > &image, const GpuBufferBinding &spec)
Bind a storage image descriptor at the given slot index.
void dispatch_batched(uint32_t pass_count, const std::array< uint32_t, 3 > &groups, const std::vector< GpuBufferBinding > &bindings, const std::function< void(uint32_t pass, std::vector< uint8_t > &)> &push_constant_updater, size_t push_constant_size, const std::unordered_map< std::string, std::any > &execution_metadata={})
bool initialise(const GpuShaderConfig &config, const std::vector< GpuBufferBinding > &bindings)
void bind_image_sampled(size_t index, const std::shared_ptr< Core::VKImage > &image, vk::Sampler sampler, const GpuBufferBinding &spec)
Bind a combined image+sampler descriptor at the given slot index.
void transition_image(const std::shared_ptr< Core::VKImage > &image, vk::ImageLayout old_layout, vk::ImageLayout new_layout)
Transition a VKImage layout via an immediate command submission.
void ensure_buffer(size_t index, size_t required_bytes)
void dispatch(const std::array< uint32_t, 3 > &groups, const std::vector< GpuBufferBinding > &bindings, const uint8_t *push_constant_data, size_t push_constant_size)
void bind_descriptor(size_t index, const GpuBufferBinding &spec)
@ BufferProcessing
Buffer processing (Buffers::BufferManager, processing chains)
@ Runtime
General runtime operations (default fallback)
@ Yantra
DSP algorithms, computational units, matrix operations, Grammar.
bool is_structured_modality(DataModality modality)
Check if a modality represents structured data (vectors, matrices).
Definition NDData.hpp:115
bool is_image(const fs::path &filepath)
Definition Depot.cpp:43
@ SPATIAL_X
Spatial X axis (images, tensors)
std::vector< Kakshya::DataDimension > dimensions
Metadata about data structure for reconstruction.
std::unordered_map< std::string, std::any > execution_metadata
Arbitrary metadata parameters used by operations.
Context information controlling how a compute operation executes.
std::unordered_map< size_t, std::vector< uint8_t > > aux
Erased output of a GPU dispatch: reconstructed float data plus any raw auxiliary outputs keyed by bin...
std::array< uint32_t, 3 > workgroup_size
Plain-data description of the compute shader to dispatch.