From 7479fe6ae0935fb8f9e3c64c16c7e6855481d1da Mon Sep 17 00:00:00 2001
From: Jason Ekstrand
Date: Thu, 6 Aug 2020 23:05:03 -0500
Subject: [PATCH] anv: Implement vkCmdTraceRays and vkCmdTraceRaysIndirect

v2: Fix anv_cmd_state::binding_tables array size (Lionel)
v2: Fix anv_cmd_state::samplers array size (Lionel)

Reviewed-by: Lionel Landwerlin
Part-of:
---
 src/intel/vulkan/anv_private.h     |   4 +-
 src/intel/vulkan/genX_cmd_buffer.c | 245 ++++++++++++++++++++++++++++-
 2 files changed, 245 insertions(+), 4 deletions(-)

diff --git a/src/intel/vulkan/anv_private.h b/src/intel/vulkan/anv_private.h
index 4e9412c32cf..42d86f27c17 100644
--- a/src/intel/vulkan/anv_private.h
+++ b/src/intel/vulkan/anv_private.h
@@ -2983,8 +2983,8 @@ struct anv_cmd_state {
    struct anv_vertex_binding vertex_bindings[MAX_VBS];
    bool xfb_enabled;
    struct anv_xfb_binding xfb_bindings[MAX_XFB_BUFFERS];
-   struct anv_state binding_tables[MESA_SHADER_STAGES];
-   struct anv_state samplers[MESA_SHADER_STAGES];
+   struct anv_state binding_tables[MESA_VULKAN_SHADER_STAGES];
+   struct anv_state samplers[MESA_VULKAN_SHADER_STAGES];

    unsigned char sampler_sha1s[MESA_SHADER_STAGES][20];
    unsigned char surface_sha1s[MESA_SHADER_STAGES][20];

diff --git a/src/intel/vulkan/genX_cmd_buffer.c b/src/intel/vulkan/genX_cmd_buffer.c
index ca7ccb39d46..28ca2b56f88 100644
--- a/src/intel/vulkan/genX_cmd_buffer.c
+++ b/src/intel/vulkan/genX_cmd_buffer.c
@@ -34,6 +34,7 @@
 #include "common/intel_l3_config.h"
 #include "genxml/gen_macros.h"
 #include "genxml/genX_pack.h"
+#include "genxml/gen_rt_pack.h"

 #include "nir/nir_xfb_info.h"

@@ -4928,6 +4929,227 @@ void genX(CmdDispatchIndirect)(
    emit_cs_walker(cmd_buffer, pipeline, true, prog_data, 0, 0, 0);
 }

+#if GFX_VERx10 >= 125
+static void
+calc_local_trace_size(uint8_t local_shift[3], const uint32_t global[3])
+{
+   unsigned total_shift = 0;
+   memset(local_shift, 0, 3);
+
+   bool progress;
+   do {
+      progress = false;
+      for (unsigned i = 0; i < 3; i++) {
+         assert(global[i] > 0);
+         if ((1 << local_shift[i]) < global[i]) {
+            progress = true;
+            local_shift[i]++;
+            total_shift++;
+         }
+
+         if (total_shift == 3)
+            return;
+      }
+   } while(progress);
+
+   /* Assign whatever's left to x */
+   local_shift[0] += 3 - total_shift;
+}
+
+static struct GFX_RT_SHADER_TABLE
+vk_sdar_to_shader_table(const VkStridedDeviceAddressRegionKHR *region)
+{
+   return (struct GFX_RT_SHADER_TABLE) {
+      .BaseAddress = anv_address_from_u64(region->deviceAddress),
+      .Stride = region->stride,
+   };
+}
+
+static void
+cmd_buffer_trace_rays(struct anv_cmd_buffer *cmd_buffer,
+                      const VkStridedDeviceAddressRegionKHR *raygen_sbt,
+                      const VkStridedDeviceAddressRegionKHR *miss_sbt,
+                      const VkStridedDeviceAddressRegionKHR *hit_sbt,
+                      const VkStridedDeviceAddressRegionKHR *callable_sbt,
+                      bool is_indirect,
+                      uint32_t launch_width,
+                      uint32_t launch_height,
+                      uint32_t launch_depth,
+                      uint64_t launch_size_addr)
+{
+   struct anv_cmd_ray_tracing_state *rt = &cmd_buffer->state.rt;
+   struct anv_ray_tracing_pipeline *pipeline = rt->pipeline;
+
+   if (anv_batch_has_error(&cmd_buffer->batch))
+      return;
+
+   /* If we have a known degenerate launch size, just bail */
+   if (!is_indirect &&
+       (launch_width == 0 || launch_height == 0 || launch_depth == 0))
+      return;
+
+   genX(cmd_buffer_config_l3)(cmd_buffer, pipeline->base.l3_config);
+   genX(flush_pipeline_select_gpgpu)(cmd_buffer);
+
+   cmd_buffer->state.rt.pipeline_dirty = false;
+
+   genX(cmd_buffer_apply_pipe_flushes)(cmd_buffer);
+
+   /* Add these to the reloc list as they're internal buffers that don't
+    * actually have relocs to pick them up, so we add them manually.
+    *
+    * TODO(RT): This is a bit of a hack
+    */
+   anv_reloc_list_add_bo(cmd_buffer->batch.relocs,
+                         cmd_buffer->batch.alloc,
+                         rt->scratch.bo);
+
+   /* Allocate and set up our RT_DISPATCH_GLOBALS */
+   struct anv_state rtdg_state =
+      anv_cmd_buffer_alloc_dynamic_state(cmd_buffer,
+                                         BRW_RT_PUSH_CONST_OFFSET +
+                                         sizeof(struct anv_push_constants),
+                                         64);
+
+   struct GFX_RT_DISPATCH_GLOBALS rtdg = {
+      .MemBaseAddress = (struct anv_address) {
+         .bo = rt->scratch.bo,
+         .offset = rt->scratch.layout.ray_stack_start,
+      },
+      .CallStackHandler =
+         anv_shader_bin_get_bsr(pipeline->trivial_return_shader, 0),
+      .AsyncRTStackSize = rt->scratch.layout.ray_stack_stride / 64,
+      .NumDSSRTStacks = rt->scratch.layout.stack_ids_per_dss,
+      .MaxBVHLevels = BRW_RT_MAX_BVH_LEVELS,
+      .Flags = RT_DEPTH_TEST_LESS_EQUAL,
+      .HitGroupTable = vk_sdar_to_shader_table(hit_sbt),
+      .MissGroupTable = vk_sdar_to_shader_table(miss_sbt),
+      .SWStackSize = rt->scratch.layout.sw_stack_size / 64,
+      .LaunchWidth = launch_width,
+      .LaunchHeight = launch_height,
+      .LaunchDepth = launch_depth,
+      .CallableGroupTable = vk_sdar_to_shader_table(callable_sbt),
+   };
+   GFX_RT_DISPATCH_GLOBALS_pack(NULL, rtdg_state.map, &rtdg);
+
+   /* Push constants go after the RT_DISPATCH_GLOBALS */
+   assert(GFX_RT_DISPATCH_GLOBALS_length * 4 <= BRW_RT_PUSH_CONST_OFFSET);
+   memcpy(rtdg_state.map + BRW_RT_PUSH_CONST_OFFSET,
+          &cmd_buffer->state.rt.base.push_constants,
+          sizeof(struct anv_push_constants));
+
+   struct anv_address rtdg_addr = {
+      .bo = cmd_buffer->device->dynamic_state_pool.block_pool.bo,
+      .offset = rtdg_state.offset,
+   };
+
+   uint8_t local_size_log2[3];
+   uint32_t global_size[3] = {};
+   if (is_indirect) {
+      /* Pick a local size that's probably ok.  We assume most TraceRays
+       * calls will use a two-dimensional dispatch size.  Worst case, our
+       * initial dispatch will be a little slower than it has to be.
+       */
+      local_size_log2[0] = 2;
+      local_size_log2[1] = 1;
+      local_size_log2[2] = 0;
+
+      struct mi_builder b;
+      mi_builder_init(&b, &cmd_buffer->device->info, &cmd_buffer->batch);
+
+      struct mi_value launch_size[3] = {
+         mi_mem32(anv_address_from_u64(launch_size_addr + 0)),
+         mi_mem32(anv_address_from_u64(launch_size_addr + 4)),
+         mi_mem32(anv_address_from_u64(launch_size_addr + 8)),
+      };
+
+      /* Store the original launch size into RT_DISPATCH_GLOBALS
+       *
+       * TODO: Pull values from genX_bits.h once RT_DISPATCH_GLOBALS gets
+       * moved into a genX version.
+       */
+      mi_store(&b, mi_mem32(anv_address_add(rtdg_addr, 52)),
+               mi_value_ref(&b, launch_size[0]));
+      mi_store(&b, mi_mem32(anv_address_add(rtdg_addr, 56)),
+               mi_value_ref(&b, launch_size[1]));
+      mi_store(&b, mi_mem32(anv_address_add(rtdg_addr, 60)),
+               mi_value_ref(&b, launch_size[2]));
+
+      /* Compute the global dispatch size */
+      for (unsigned i = 0; i < 3; i++) {
+         if (local_size_log2[i] == 0)
+            continue;
+
+         /* global_size = DIV_ROUND_UP(launch_size, local_size)
+          *
+          * Fortunately for us, MI_ALU math is 64-bit and mi_ushr32_imm has
+          * the semantics of shifting the entire 64-bit value and taking the
+          * bottom 32 bits, so we don't have to worry about roll-over.
+          */
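+         /* Worked example (illustrative): with launch_size[i] = 1920 and
+          * local_size_log2[i] = 2, the add/shift below computes
+          * (1920 + 3) >> 2 = 480, which is exactly DIV_ROUND_UP(1920, 4),
+          * with the intermediate addition held in a 64-bit GPR.
+          */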
+         uint32_t local_size = 1 << local_size_log2[i];
+         launch_size[i] = mi_iadd(&b, launch_size[i],
+                                  mi_imm(local_size - 1));
+         launch_size[i] = mi_ushr32_imm(&b, launch_size[i],
+                                        local_size_log2[i]);
+      }
+
+      mi_store(&b, mi_reg32(GPGPU_DISPATCHDIMX), launch_size[0]);
+      mi_store(&b, mi_reg32(GPGPU_DISPATCHDIMY), launch_size[1]);
+      mi_store(&b, mi_reg32(GPGPU_DISPATCHDIMZ), launch_size[2]);
+   } else {
+      uint32_t launch_size[3] = { launch_width, launch_height, launch_depth };
+      calc_local_trace_size(local_size_log2, launch_size);
+
+      for (unsigned i = 0; i < 3; i++) {
+         /* We have to be a bit careful here because DIV_ROUND_UP adds to
+          * the numerator and that addition may overflow.  Cast to uint64_t
+          * to avoid this.
+          */
+         uint32_t local_size = 1 << local_size_log2[i];
+         global_size[i] = DIV_ROUND_UP((uint64_t)launch_size[i], local_size);
+      }
+   }
+
+   anv_batch_emit(&cmd_buffer->batch, GENX(COMPUTE_WALKER), cw) {
+      cw.IndirectParameterEnable = is_indirect;
+      cw.PredicateEnable = false;
+      cw.SIMDSize = SIMD8;
+      cw.LocalXMaximum = (1 << local_size_log2[0]) - 1;
+      cw.LocalYMaximum = (1 << local_size_log2[1]) - 1;
+      cw.LocalZMaximum = (1 << local_size_log2[2]) - 1;
+      cw.ThreadGroupIDXDimension = global_size[0];
+      cw.ThreadGroupIDYDimension = global_size[1];
+      cw.ThreadGroupIDZDimension = global_size[2];
+      cw.ExecutionMask = 0xff;
+      cw.EmitInlineParameter = true;
+
+      const gl_shader_stage s = MESA_SHADER_RAYGEN;
+      struct anv_state *surfaces = &cmd_buffer->state.binding_tables[s];
+      struct anv_state *samplers = &cmd_buffer->state.samplers[s];
+      cw.InterfaceDescriptor = (struct GENX(INTERFACE_DESCRIPTOR_DATA)) {
+         .KernelStartPointer = pipeline->trampoline.offset,
+         .SamplerStatePointer = samplers->offset,
+         /* i965: DIV_ROUND_UP(CLAMP(stage_state->sampler_count, 0, 16), 4), */
+         .SamplerCount = 0,
+         .BindingTablePointer = surfaces->offset,
+         .NumberofThreadsinGPGPUThreadGroup = 1,
+         .BTDMode = true,
+      };
+
+      struct brw_rt_raygen_trampoline_params trampoline_params = {
+         .rt_disp_globals_addr = anv_address_physical(rtdg_addr),
+         .raygen_bsr_addr = raygen_sbt->deviceAddress,
+         .is_indirect = is_indirect,
+         .local_group_size_log2 = {
+            local_size_log2[0],
+            local_size_log2[1],
+            local_size_log2[2],
+         },
+      };
+      STATIC_ASSERT(sizeof(trampoline_params) == 32);
+      memcpy(cw.InlineData, &trampoline_params, sizeof(trampoline_params));
+   }
+}
+
 void
 genX(CmdTraceRaysKHR)(
     VkCommandBuffer commandBuffer,
@@ -4939,7 +5161,16 @@ genX(CmdTraceRaysKHR)(
     uint32_t height,
     uint32_t depth)
 {
-   unreachable("Unimplemented");
+   ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer);
+
+   cmd_buffer_trace_rays(cmd_buffer,
+                         pRaygenShaderBindingTable,
+                         pMissShaderBindingTable,
+                         pHitShaderBindingTable,
+                         pCallableShaderBindingTable,
+                         false /* is_indirect */,
+                         width, height, depth,
+                         0 /* launch_size_addr */);
 }

 void
@@ -4951,8 +5182,18 @@ genX(CmdTraceRaysIndirectKHR)(
     const VkStridedDeviceAddressRegionKHR* pCallableShaderBindingTable,
     VkDeviceAddress indirectDeviceAddress)
 {
-   unreachable("Unimplemented");
+   ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer);
+
+   cmd_buffer_trace_rays(cmd_buffer,
+                         pRaygenShaderBindingTable,
+                         pMissShaderBindingTable,
+                         pHitShaderBindingTable,
+                         pCallableShaderBindingTable,
+                         true /* is_indirect */,
+                         0, 0, 0, /* width, height, depth, */
+                         indirectDeviceAddress);
 }
+#endif /* GFX_VERx10 >= 125 */

 static void
 genX(flush_pipeline_select)(struct anv_cmd_buffer *cmd_buffer,
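
Note: the following is a small, self-contained sketch (not part of the patch) that mirrors what the direct, non-indirect path above does with the launch size. calc_local_trace_size() is copied from the diff, and DIV_ROUND_UP is redefined locally in place of Mesa's util macro so the example builds on its own.

#include <assert.h>
#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>
#include <string.h>

#define DIV_ROUND_UP(n, d) (((n) + (d) - 1) / (d))

/* Same algorithm as calc_local_trace_size() in the patch: spread a budget of
 * 2^3 = 8 invocations per local group across X/Y/Z, growing each dimension
 * that is still smaller than the launch size, and give any leftover bits to X.
 */
static void
calc_local_trace_size(uint8_t local_shift[3], const uint32_t global[3])
{
   unsigned total_shift = 0;
   memset(local_shift, 0, 3);

   bool progress;
   do {
      progress = false;
      for (unsigned i = 0; i < 3; i++) {
         assert(global[i] > 0);
         if ((1 << local_shift[i]) < global[i]) {
            progress = true;
            local_shift[i]++;
            total_shift++;
         }

         if (total_shift == 3)
            return;
      }
   } while (progress);

   /* Assign whatever's left to x */
   local_shift[0] += 3 - total_shift;
}

int
main(void)
{
   const uint32_t launch[3] = { 1920, 1080, 1 };
   uint8_t local_log2[3];

   calc_local_trace_size(local_log2, launch);

   for (unsigned i = 0; i < 3; i++) {
      uint32_t local_size = 1u << local_log2[i];
      /* Same round-up division as the patch; the cast keeps the +(d - 1)
       * addition from overflowing 32 bits.
       */
      uint32_t global_size = DIV_ROUND_UP((uint64_t)launch[i], local_size);
      printf("dim %u: local %u, groups %u\n",
             i, (unsigned)local_size, (unsigned)global_size);
   }

   return 0;
}

For a 1920x1080x1 launch this prints a 4x2x1 local group and 480x540x1 groups, which match the LocalXMaximum/LocalYMaximum/LocalZMaximum (each maximum is size - 1) and ThreadGroupID*Dimension values the COMPUTE_WALKER in the patch would be programmed with.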