anv: Implement vkCmdTraceRays and vkCmdTraceRaysIndirect

v2: Fix anv_cmd_state::binding_tables array size (Lionel)

v2: Fix anv_cmd_state::samplers array size (Lionel)

Reviewed-by: Lionel Landwerlin <lionel.g.landwerlin@intel.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/8637>
Author: Jason Ekstrand, 2020-08-06 23:05:03 -05:00 (committed by Marge Bot)
parent ac6d7a1758
commit 7479fe6ae0
2 changed files with 245 additions and 4 deletions

src/intel/vulkan/anv_private.h

@@ -2983,8 +2983,8 @@ struct anv_cmd_state {
   struct anv_vertex_binding vertex_bindings[MAX_VBS];
   bool xfb_enabled;
   struct anv_xfb_binding xfb_bindings[MAX_XFB_BUFFERS];
-  struct anv_state binding_tables[MESA_SHADER_STAGES];
-  struct anv_state samplers[MESA_SHADER_STAGES];
+  struct anv_state binding_tables[MESA_VULKAN_SHADER_STAGES];
+  struct anv_state samplers[MESA_VULKAN_SHADER_STAGES];
   unsigned char sampler_sha1s[MESA_SHADER_STAGES][20];
   unsigned char surface_sha1s[MESA_SHADER_STAGES][20];

src/intel/vulkan/genX_cmd_buffer.c

@@ -34,6 +34,7 @@
 #include "common/intel_l3_config.h"
 #include "genxml/gen_macros.h"
 #include "genxml/genX_pack.h"
+#include "genxml/gen_rt_pack.h"
 #include "nir/nir_xfb_info.h"
@@ -4928,6 +4929,227 @@ void genX(CmdDispatchIndirect)(
   emit_cs_walker(cmd_buffer, pipeline, true, prog_data, 0, 0, 0);
}

#if GFX_VERx10 >= 125
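
/* Pick a log2 local workgroup size for the given global launch size.  The
 * shifts always sum to 3, i.e. a local group of 2^3 = 8 rays, handed out
 * round-robin across dimensions with any remainder going to X.  For
 * example, a 1920x1080x1 launch yields local_shift = { 2, 1, 0 } (a 4x2x1
 * local group) and a 1x1x1 launch yields { 3, 0, 0 }.
 */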
static void
calc_local_trace_size(uint8_t local_shift[3], const uint32_t global[3])
{
unsigned total_shift = 0;
memset(local_shift, 0, 3);
bool progress;
do {
progress = false;
for (unsigned i = 0; i < 3; i++) {
assert(global[i] > 0);
if ((1 << local_shift[i]) < global[i]) {
progress = true;
local_shift[i]++;
total_shift++;
}
if (total_shift == 3)
return;
}
   } while (progress);
/* Assign whatever's left to x */
local_shift[0] += 3 - total_shift;
}
static struct GFX_RT_SHADER_TABLE
vk_sdar_to_shader_table(const VkStridedDeviceAddressRegionKHR *region)
{
return (struct GFX_RT_SHADER_TABLE) {
.BaseAddress = anv_address_from_u64(region->deviceAddress),
.Stride = region->stride,
};
}
static void
cmd_buffer_trace_rays(struct anv_cmd_buffer *cmd_buffer,
const VkStridedDeviceAddressRegionKHR *raygen_sbt,
const VkStridedDeviceAddressRegionKHR *miss_sbt,
const VkStridedDeviceAddressRegionKHR *hit_sbt,
const VkStridedDeviceAddressRegionKHR *callable_sbt,
bool is_indirect,
uint32_t launch_width,
uint32_t launch_height,
uint32_t launch_depth,
uint64_t launch_size_addr)
{
struct anv_cmd_ray_tracing_state *rt = &cmd_buffer->state.rt;
struct anv_ray_tracing_pipeline *pipeline = rt->pipeline;
if (anv_batch_has_error(&cmd_buffer->batch))
return;
/* If we have a known degenerate launch size, just bail */
if (!is_indirect &&
(launch_width == 0 || launch_height == 0 || launch_depth == 0))
return;
genX(cmd_buffer_config_l3)(cmd_buffer, pipeline->base.l3_config);
genX(flush_pipeline_select_gpgpu)(cmd_buffer);
cmd_buffer->state.rt.pipeline_dirty = false;
genX(cmd_buffer_apply_pipe_flushes)(cmd_buffer);
   /* Add this to the reloc list manually as it's an internal buffer that
    * doesn't have any relocs to pick it up.
    *
    * TODO(RT): This is a bit of a hack
    */
anv_reloc_list_add_bo(cmd_buffer->batch.relocs,
cmd_buffer->batch.alloc,
rt->scratch.bo);
/* Allocate and set up our RT_DISPATCH_GLOBALS */
struct anv_state rtdg_state =
anv_cmd_buffer_alloc_dynamic_state(cmd_buffer,
BRW_RT_PUSH_CONST_OFFSET +
sizeof(struct anv_push_constants),
64);
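
   /* Layout of this allocation:
    *
    *    [0, GFX_RT_DISPATCH_GLOBALS_length * 4)  RT_DISPATCH_GLOBALS
    *    [..., BRW_RT_PUSH_CONST_OFFSET)          padding
    *    [BRW_RT_PUSH_CONST_OFFSET, end)          anv_push_constants
    */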
struct GFX_RT_DISPATCH_GLOBALS rtdg = {
.MemBaseAddress = (struct anv_address) {
.bo = rt->scratch.bo,
.offset = rt->scratch.layout.ray_stack_start,
},
.CallStackHandler =
anv_shader_bin_get_bsr(pipeline->trivial_return_shader, 0),
.AsyncRTStackSize = rt->scratch.layout.ray_stack_stride / 64,
.NumDSSRTStacks = rt->scratch.layout.stack_ids_per_dss,
.MaxBVHLevels = BRW_RT_MAX_BVH_LEVELS,
.Flags = RT_DEPTH_TEST_LESS_EQUAL,
.HitGroupTable = vk_sdar_to_shader_table(hit_sbt),
.MissGroupTable = vk_sdar_to_shader_table(miss_sbt),
.SWStackSize = rt->scratch.layout.sw_stack_size / 64,
.LaunchWidth = launch_width,
.LaunchHeight = launch_height,
.LaunchDepth = launch_depth,
.CallableGroupTable = vk_sdar_to_shader_table(callable_sbt),
};
GFX_RT_DISPATCH_GLOBALS_pack(NULL, rtdg_state.map, &rtdg);
/* Push constants go after the RT_DISPATCH_GLOBALS */
assert(GFX_RT_DISPATCH_GLOBALS_length * 4 <= BRW_RT_PUSH_CONST_OFFSET);
memcpy(rtdg_state.map + BRW_RT_PUSH_CONST_OFFSET,
&cmd_buffer->state.rt.base.push_constants,
sizeof(struct anv_push_constants));
struct anv_address rtdg_addr = {
.bo = cmd_buffer->device->dynamic_state_pool.block_pool.bo,
.offset = rtdg_state.offset,
};
uint8_t local_size_log2[3];
uint32_t global_size[3] = {};
if (is_indirect) {
/* Pick a local size that's probably ok. We assume most TraceRays calls
* will use a two-dimensional dispatch size. Worst case, our initial
* dispatch will be a little slower than it has to be.
*/
local_size_log2[0] = 2;
local_size_log2[1] = 1;
local_size_log2[2] = 0;
struct mi_builder b;
mi_builder_init(&b, &cmd_buffer->device->info, &cmd_buffer->batch);
struct mi_value launch_size[3] = {
mi_mem32(anv_address_from_u64(launch_size_addr + 0)),
mi_mem32(anv_address_from_u64(launch_size_addr + 4)),
mi_mem32(anv_address_from_u64(launch_size_addr + 8)),
};
/* Store the original launch size into RT_DISPATCH_GLOBALS
*
* TODO: Pull values from genX_bits.h once RT_DISPATCH_GLOBALS gets
* moved into a genX version.
*/
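      /* Offsets 52, 56, and 60 are the LaunchWidth, LaunchHeight, and
       * LaunchDepth dwords of the RT_DISPATCH_GLOBALS packed above.
       */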
mi_store(&b, mi_mem32(anv_address_add(rtdg_addr, 52)),
mi_value_ref(&b, launch_size[0]));
mi_store(&b, mi_mem32(anv_address_add(rtdg_addr, 56)),
mi_value_ref(&b, launch_size[1]));
mi_store(&b, mi_mem32(anv_address_add(rtdg_addr, 60)),
mi_value_ref(&b, launch_size[2]));
/* Compute the global dispatch size */
for (unsigned i = 0; i < 3; i++) {
if (local_size_log2[i] == 0)
continue;
      /* global_size = DIV_ROUND_UP(launch_size, local_size)
       *
       * Fortunately for us, MI_ALU math is 64-bit and mi_ushr32_imm has the
       * semantics of shifting the entire 64-bit value and taking the bottom
       * 32 bits, so we don't have to worry about roll-over.
       */
uint32_t local_size = 1 << local_size_log2[i];
launch_size[i] = mi_iadd(&b, launch_size[i],
mi_imm(local_size - 1));
launch_size[i] = mi_ushr32_imm(&b, launch_size[i],
local_size_log2[i]);
}
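
      /* For example, a launch width of 1920 with local_size_log2[0] == 2
       * (local X size 4) computes (1920 + 3) >> 2 = 480 thread groups in X.
       */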
mi_store(&b, mi_reg32(GPGPU_DISPATCHDIMX), launch_size[0]);
mi_store(&b, mi_reg32(GPGPU_DISPATCHDIMY), launch_size[1]);
mi_store(&b, mi_reg32(GPGPU_DISPATCHDIMZ), launch_size[2]);
} else {
uint32_t launch_size[3] = { launch_width, launch_height, launch_depth };
calc_local_trace_size(local_size_log2, launch_size);
for (unsigned i = 0; i < 3; i++) {
      /* We have to be a bit careful here because DIV_ROUND_UP adds to the
       * numerator and the sum may overflow.  Cast to uint64_t to avoid this.
       */
uint32_t local_size = 1 << local_size_log2[i];
global_size[i] = DIV_ROUND_UP((uint64_t)launch_size[i], local_size);
}
}
anv_batch_emit(&cmd_buffer->batch, GENX(COMPUTE_WALKER), cw) {
cw.IndirectParameterEnable = is_indirect;
cw.PredicateEnable = false;
cw.SIMDSize = SIMD8;
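      /* The Local*Maximum fields hold the largest local invocation index in
       * each dimension, i.e. the local size minus one.
       */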
cw.LocalXMaximum = (1 << local_size_log2[0]) - 1;
cw.LocalYMaximum = (1 << local_size_log2[1]) - 1;
cw.LocalZMaximum = (1 << local_size_log2[2]) - 1;
cw.ThreadGroupIDXDimension = global_size[0];
cw.ThreadGroupIDYDimension = global_size[1];
cw.ThreadGroupIDZDimension = global_size[2];
cw.ExecutionMask = 0xff;
cw.EmitInlineParameter = true;
const gl_shader_stage s = MESA_SHADER_RAYGEN;
struct anv_state *surfaces = &cmd_buffer->state.binding_tables[s];
struct anv_state *samplers = &cmd_buffer->state.samplers[s];
cw.InterfaceDescriptor = (struct GENX(INTERFACE_DESCRIPTOR_DATA)) {
.KernelStartPointer = pipeline->trampoline.offset,
.SamplerStatePointer = samplers->offset,
/* i965: DIV_ROUND_UP(CLAMP(stage_state->sampler_count, 0, 16), 4), */
.SamplerCount = 0,
.BindingTablePointer = surfaces->offset,
.NumberofThreadsinGPGPUThreadGroup = 1,
.BTDMode = true,
};
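
      /* The raygen trampoline shader reads these parameters from the
       * COMPUTE_WALKER inline data, fetches the raygen shader record from
       * the SBT, and BTD-dispatches the actual raygen shader.
       */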
struct brw_rt_raygen_trampoline_params trampoline_params = {
.rt_disp_globals_addr = anv_address_physical(rtdg_addr),
.raygen_bsr_addr = raygen_sbt->deviceAddress,
.is_indirect = is_indirect,
.local_group_size_log2 = {
local_size_log2[0],
local_size_log2[1],
local_size_log2[2],
},
};
STATIC_ASSERT(sizeof(trampoline_params) == 32);
memcpy(cw.InlineData, &trampoline_params, sizeof(trampoline_params));
}
}
void
genX(CmdTraceRaysKHR)(
   VkCommandBuffer commandBuffer,
@@ -4939,7 +5161,16 @@ genX(CmdTraceRaysKHR)(
   uint32_t height,
   uint32_t depth)
{
-   unreachable("Unimplemented");
+   ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer);
+
+   cmd_buffer_trace_rays(cmd_buffer,
+                         pRaygenShaderBindingTable,
+                         pMissShaderBindingTable,
+                         pHitShaderBindingTable,
+                         pCallableShaderBindingTable,
+                         false /* is_indirect */,
+                         width, height, depth,
+                         0 /* launch_size_addr */);
}
void
@@ -4951,8 +5182,18 @@ genX(CmdTraceRaysIndirectKHR)(
   const VkStridedDeviceAddressRegionKHR* pCallableShaderBindingTable,
   VkDeviceAddress indirectDeviceAddress)
{
-   unreachable("Unimplemented");
+   ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer);
+
+   cmd_buffer_trace_rays(cmd_buffer,
+                         pRaygenShaderBindingTable,
+                         pMissShaderBindingTable,
+                         pHitShaderBindingTable,
+                         pCallableShaderBindingTable,
+                         true /* is_indirect */,
+                         0, 0, 0, /* width, height, depth */
+                         indirectDeviceAddress);
}

#endif /* GFX_VERx10 >= 125 */
static void
genX(flush_pipeline_select)(struct anv_cmd_buffer *cmd_buffer,