mirror of
https://gitlab.freedesktop.org/mesa/mesa.git
synced 2025-12-21 11:30:11 +01:00
anv: Implement vkCmdTraceRays and vkCmdTraceRaysIndirect
v2: Fix anv_cmd_state::binding_tables array size (Lionel) v2: Fix anv_cmd_state::samplers array size (Lionel) Reviewed-by: Lionel Landwerlin <lionel.g.landwerlin@intel.com> Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/8637>
This commit is contained in:
parent
ac6d7a1758
commit
7479fe6ae0
2 changed files with 245 additions and 4 deletions
|
|
@ -2983,8 +2983,8 @@ struct anv_cmd_state {
|
|||
struct anv_vertex_binding vertex_bindings[MAX_VBS];
|
||||
bool xfb_enabled;
|
||||
struct anv_xfb_binding xfb_bindings[MAX_XFB_BUFFERS];
|
||||
struct anv_state binding_tables[MESA_SHADER_STAGES];
|
||||
struct anv_state samplers[MESA_SHADER_STAGES];
|
||||
struct anv_state binding_tables[MESA_VULKAN_SHADER_STAGES];
|
||||
struct anv_state samplers[MESA_VULKAN_SHADER_STAGES];
|
||||
|
||||
unsigned char sampler_sha1s[MESA_SHADER_STAGES][20];
|
||||
unsigned char surface_sha1s[MESA_SHADER_STAGES][20];
|
||||
|
|
|
|||
|
|
@ -34,6 +34,7 @@
|
|||
#include "common/intel_l3_config.h"
|
||||
#include "genxml/gen_macros.h"
|
||||
#include "genxml/genX_pack.h"
|
||||
#include "genxml/gen_rt_pack.h"
|
||||
|
||||
#include "nir/nir_xfb_info.h"
|
||||
|
||||
|
|
@ -4928,6 +4929,227 @@ void genX(CmdDispatchIndirect)(
|
|||
emit_cs_walker(cmd_buffer, pipeline, true, prog_data, 0, 0, 0);
|
||||
}
|
||||
|
||||
#if GFX_VERx10 >= 125
|
||||
/**
 * Choose per-dimension log2 local sizes for a direct ray-tracing launch.
 *
 * A total shift of three is handed out one step at a time, visiting x, y
 * and z in turn.  A dimension only receives another step while its current
 * local size (1 << shift) is still smaller than the global size in that
 * dimension.  If every dimension is saturated before the three steps have
 * been used, the remainder is added to x, so the shifts always sum to 3.
 *
 * \param local_shift  Output: log2 of the local size in x, y and z.
 * \param global       Launch size in x, y and z; every entry must be
 *                     non-zero (asserted).
 */
static void
calc_local_trace_size(uint8_t local_shift[3], const uint32_t global[3])
{
   const unsigned shift_budget = 3;
   unsigned used = 0;

   local_shift[0] = 0;
   local_shift[1] = 0;
   local_shift[2] = 0;

   for (;;) {
      bool grew = false;

      for (unsigned dim = 0; dim < 3; dim++) {
         assert(global[dim] > 0);

         /* This dimension is already covered by its local size */
         if ((1 << local_shift[dim]) >= global[dim])
            continue;

         local_shift[dim]++;
         grew = true;

         if (++used == shift_budget)
            return;
      }

      /* A full pass without growth means nothing more can be assigned */
      if (!grew)
         break;
   }

   /* Whatever is left of the budget goes to x */
   local_shift[0] += shift_budget - used;
}
|
||||
|
||||
static struct GFX_RT_SHADER_TABLE
|
||||
vk_sdar_to_shader_table(const VkStridedDeviceAddressRegionKHR *region)
|
||||
{
|
||||
return (struct GFX_RT_SHADER_TABLE) {
|
||||
.BaseAddress = anv_address_from_u64(region->deviceAddress),
|
||||
.Stride = region->stride,
|
||||
};
|
||||
}
|
||||
|
||||
static void
|
||||
cmd_buffer_trace_rays(struct anv_cmd_buffer *cmd_buffer,
|
||||
const VkStridedDeviceAddressRegionKHR *raygen_sbt,
|
||||
const VkStridedDeviceAddressRegionKHR *miss_sbt,
|
||||
const VkStridedDeviceAddressRegionKHR *hit_sbt,
|
||||
const VkStridedDeviceAddressRegionKHR *callable_sbt,
|
||||
bool is_indirect,
|
||||
uint32_t launch_width,
|
||||
uint32_t launch_height,
|
||||
uint32_t launch_depth,
|
||||
uint64_t launch_size_addr)
|
||||
{
|
||||
struct anv_cmd_ray_tracing_state *rt = &cmd_buffer->state.rt;
|
||||
struct anv_ray_tracing_pipeline *pipeline = rt->pipeline;
|
||||
|
||||
if (anv_batch_has_error(&cmd_buffer->batch))
|
||||
return;
|
||||
|
||||
/* If we have a known degenerate launch size, just bail */
|
||||
if (!is_indirect &&
|
||||
(launch_width == 0 || launch_height == 0 || launch_depth == 0))
|
||||
return;
|
||||
|
||||
genX(cmd_buffer_config_l3)(cmd_buffer, pipeline->base.l3_config);
|
||||
genX(flush_pipeline_select_gpgpu)(cmd_buffer);
|
||||
|
||||
cmd_buffer->state.rt.pipeline_dirty = false;
|
||||
|
||||
genX(cmd_buffer_apply_pipe_flushes)(cmd_buffer);
|
||||
|
||||
/* Add these to the reloc list as they're internal buffers that don't
|
||||
* actually have relocs to pick them up manually.
|
||||
*
|
||||
* TODO(RT): This is a bit of a hack
|
||||
*/
|
||||
anv_reloc_list_add_bo(cmd_buffer->batch.relocs,
|
||||
cmd_buffer->batch.alloc,
|
||||
rt->scratch.bo);
|
||||
|
||||
/* Allocate and set up our RT_DISPATCH_GLOBALS */
|
||||
struct anv_state rtdg_state =
|
||||
anv_cmd_buffer_alloc_dynamic_state(cmd_buffer,
|
||||
BRW_RT_PUSH_CONST_OFFSET +
|
||||
sizeof(struct anv_push_constants),
|
||||
64);
|
||||
|
||||
struct GFX_RT_DISPATCH_GLOBALS rtdg = {
|
||||
.MemBaseAddress = (struct anv_address) {
|
||||
.bo = rt->scratch.bo,
|
||||
.offset = rt->scratch.layout.ray_stack_start,
|
||||
},
|
||||
.CallStackHandler =
|
||||
anv_shader_bin_get_bsr(pipeline->trivial_return_shader, 0),
|
||||
.AsyncRTStackSize = rt->scratch.layout.ray_stack_stride / 64,
|
||||
.NumDSSRTStacks = rt->scratch.layout.stack_ids_per_dss,
|
||||
.MaxBVHLevels = BRW_RT_MAX_BVH_LEVELS,
|
||||
.Flags = RT_DEPTH_TEST_LESS_EQUAL,
|
||||
.HitGroupTable = vk_sdar_to_shader_table(hit_sbt),
|
||||
.MissGroupTable = vk_sdar_to_shader_table(miss_sbt),
|
||||
.SWStackSize = rt->scratch.layout.sw_stack_size / 64,
|
||||
.LaunchWidth = launch_width,
|
||||
.LaunchHeight = launch_height,
|
||||
.LaunchDepth = launch_depth,
|
||||
.CallableGroupTable = vk_sdar_to_shader_table(callable_sbt),
|
||||
};
|
||||
GFX_RT_DISPATCH_GLOBALS_pack(NULL, rtdg_state.map, &rtdg);
|
||||
|
||||
/* Push constants go after the RT_DISPATCH_GLOBALS */
|
||||
assert(GFX_RT_DISPATCH_GLOBALS_length * 4 <= BRW_RT_PUSH_CONST_OFFSET);
|
||||
memcpy(rtdg_state.map + BRW_RT_PUSH_CONST_OFFSET,
|
||||
&cmd_buffer->state.rt.base.push_constants,
|
||||
sizeof(struct anv_push_constants));
|
||||
|
||||
struct anv_address rtdg_addr = {
|
||||
.bo = cmd_buffer->device->dynamic_state_pool.block_pool.bo,
|
||||
.offset = rtdg_state.offset,
|
||||
};
|
||||
|
||||
uint8_t local_size_log2[3];
|
||||
uint32_t global_size[3] = {};
|
||||
if (is_indirect) {
|
||||
/* Pick a local size that's probably ok. We assume most TraceRays calls
|
||||
* will use a two-dimensional dispatch size. Worst case, our initial
|
||||
* dispatch will be a little slower than it has to be.
|
||||
*/
|
||||
local_size_log2[0] = 2;
|
||||
local_size_log2[1] = 1;
|
||||
local_size_log2[2] = 0;
|
||||
|
||||
struct mi_builder b;
|
||||
mi_builder_init(&b, &cmd_buffer->device->info, &cmd_buffer->batch);
|
||||
|
||||
struct mi_value launch_size[3] = {
|
||||
mi_mem32(anv_address_from_u64(launch_size_addr + 0)),
|
||||
mi_mem32(anv_address_from_u64(launch_size_addr + 4)),
|
||||
mi_mem32(anv_address_from_u64(launch_size_addr + 8)),
|
||||
};
|
||||
|
||||
/* Store the original launch size into RT_DISPATCH_GLOBALS
|
||||
*
|
||||
* TODO: Pull values from genX_bits.h once RT_DISPATCH_GLOBALS gets
|
||||
* moved into a genX version.
|
||||
*/
|
||||
mi_store(&b, mi_mem32(anv_address_add(rtdg_addr, 52)),
|
||||
mi_value_ref(&b, launch_size[0]));
|
||||
mi_store(&b, mi_mem32(anv_address_add(rtdg_addr, 56)),
|
||||
mi_value_ref(&b, launch_size[1]));
|
||||
mi_store(&b, mi_mem32(anv_address_add(rtdg_addr, 60)),
|
||||
mi_value_ref(&b, launch_size[2]));
|
||||
|
||||
/* Compute the global dispatch size */
|
||||
for (unsigned i = 0; i < 3; i++) {
|
||||
if (local_size_log2[i] == 0)
|
||||
continue;
|
||||
|
||||
/* global_size = DIV_ROUND_UP(launch_size, local_size)
|
||||
*
|
||||
* Fortunately for us MI_ALU math is 64-bit and , mi_ushr32_imm
|
||||
* has the semantics of shifting the enture 64-bit value and taking
|
||||
* the bottom 32 so we don't have to worry about roll-over.
|
||||
*/
|
||||
uint32_t local_size = 1 << local_size_log2[i];
|
||||
launch_size[i] = mi_iadd(&b, launch_size[i],
|
||||
mi_imm(local_size - 1));
|
||||
launch_size[i] = mi_ushr32_imm(&b, launch_size[i],
|
||||
local_size_log2[i]);
|
||||
}
|
||||
|
||||
mi_store(&b, mi_reg32(GPGPU_DISPATCHDIMX), launch_size[0]);
|
||||
mi_store(&b, mi_reg32(GPGPU_DISPATCHDIMY), launch_size[1]);
|
||||
mi_store(&b, mi_reg32(GPGPU_DISPATCHDIMZ), launch_size[2]);
|
||||
} else {
|
||||
uint32_t launch_size[3] = { launch_width, launch_height, launch_depth };
|
||||
calc_local_trace_size(local_size_log2, launch_size);
|
||||
|
||||
for (unsigned i = 0; i < 3; i++) {
|
||||
/* We have to be a bit careful here because DIV_ROUND_UP adds to the
|
||||
* numerator value may overflow. Cast to uint64_t to avoid this.
|
||||
*/
|
||||
uint32_t local_size = 1 << local_size_log2[i];
|
||||
global_size[i] = DIV_ROUND_UP((uint64_t)launch_size[i], local_size);
|
||||
}
|
||||
}
|
||||
|
||||
anv_batch_emit(&cmd_buffer->batch, GENX(COMPUTE_WALKER), cw) {
|
||||
cw.IndirectParameterEnable = is_indirect;
|
||||
cw.PredicateEnable = false;
|
||||
cw.SIMDSize = SIMD8;
|
||||
cw.LocalXMaximum = (1 << local_size_log2[0]) - 1;
|
||||
cw.LocalYMaximum = (1 << local_size_log2[1]) - 1;
|
||||
cw.LocalZMaximum = (1 << local_size_log2[2]) - 1;
|
||||
cw.ThreadGroupIDXDimension = global_size[0];
|
||||
cw.ThreadGroupIDYDimension = global_size[1];
|
||||
cw.ThreadGroupIDZDimension = global_size[2];
|
||||
cw.ExecutionMask = 0xff;
|
||||
cw.EmitInlineParameter = true;
|
||||
|
||||
const gl_shader_stage s = MESA_SHADER_RAYGEN;
|
||||
struct anv_state *surfaces = &cmd_buffer->state.binding_tables[s];
|
||||
struct anv_state *samplers = &cmd_buffer->state.samplers[s];
|
||||
cw.InterfaceDescriptor = (struct GENX(INTERFACE_DESCRIPTOR_DATA)) {
|
||||
.KernelStartPointer = pipeline->trampoline.offset,
|
||||
.SamplerStatePointer = samplers->offset,
|
||||
/* i965: DIV_ROUND_UP(CLAMP(stage_state->sampler_count, 0, 16), 4), */
|
||||
.SamplerCount = 0,
|
||||
.BindingTablePointer = surfaces->offset,
|
||||
.NumberofThreadsinGPGPUThreadGroup = 1,
|
||||
.BTDMode = true,
|
||||
};
|
||||
|
||||
struct brw_rt_raygen_trampoline_params trampoline_params = {
|
||||
.rt_disp_globals_addr = anv_address_physical(rtdg_addr),
|
||||
.raygen_bsr_addr = raygen_sbt->deviceAddress,
|
||||
.is_indirect = is_indirect,
|
||||
.local_group_size_log2 = {
|
||||
local_size_log2[0],
|
||||
local_size_log2[1],
|
||||
local_size_log2[2],
|
||||
},
|
||||
};
|
||||
STATIC_ASSERT(sizeof(trampoline_params) == 32);
|
||||
memcpy(cw.InlineData, &trampoline_params, sizeof(trampoline_params));
|
||||
}
|
||||
}
|
||||
|
||||
void
|
||||
genX(CmdTraceRaysKHR)(
|
||||
VkCommandBuffer commandBuffer,
|
||||
|
|
@ -4939,7 +5161,16 @@ genX(CmdTraceRaysKHR)(
|
|||
uint32_t height,
|
||||
uint32_t depth)
|
||||
{
|
||||
unreachable("Unimplemented");
|
||||
ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer);
|
||||
|
||||
cmd_buffer_trace_rays(cmd_buffer,
|
||||
pRaygenShaderBindingTable,
|
||||
pMissShaderBindingTable,
|
||||
pHitShaderBindingTable,
|
||||
pCallableShaderBindingTable,
|
||||
false /* is_indirect */,
|
||||
width, height, depth,
|
||||
0 /* launch_size_addr */);
|
||||
}
|
||||
|
||||
void
|
||||
|
|
@ -4951,8 +5182,18 @@ genX(CmdTraceRaysIndirectKHR)(
|
|||
const VkStridedDeviceAddressRegionKHR* pCallableShaderBindingTable,
|
||||
VkDeviceAddress indirectDeviceAddress)
|
||||
{
|
||||
unreachable("Unimplemented");
|
||||
ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer);
|
||||
|
||||
cmd_buffer_trace_rays(cmd_buffer,
|
||||
pRaygenShaderBindingTable,
|
||||
pMissShaderBindingTable,
|
||||
pHitShaderBindingTable,
|
||||
pCallableShaderBindingTable,
|
||||
true /* is_indirect */,
|
||||
0, 0, 0, /* width, height, depth, */
|
||||
indirectDeviceAddress);
|
||||
}
|
||||
#endif /* GFX_VERx10 >= 125 */
|
||||
|
||||
static void
|
||||
genX(flush_pipeline_select)(struct anv_cmd_buffer *cmd_buffer,
|
||||
|
|
|
|||
Loading…
Add table
Reference in a new issue