From 1281e2b9a0959ede17f052f6b763f4734a8105dd Mon Sep 17 00:00:00 2001 From: Lionel Landwerlin Date: Fri, 24 May 2024 11:22:49 +0300 Subject: [PATCH] anv/intel: add device generated commands shaders Signed-off-by: Lionel Landwerlin Acked-by: Alyssa Rosenzweig Part-of: --- src/intel/compiler/intel_shader_enums.h | 1 + src/intel/genxml/genX_rt_cl_pack.h | 6 +- src/intel/genxml/meson.build | 35 + src/intel/shaders/dgc.cl | 947 ++++++++++++++++++++++++ src/intel/shaders/generate.cl | 66 ++ src/intel/shaders/libintel_shaders.h | 38 +- src/intel/shaders/meson.build | 2 + src/intel/vulkan/anv_types.h | 262 +++++++ 8 files changed, 1352 insertions(+), 5 deletions(-) create mode 100644 src/intel/shaders/dgc.cl diff --git a/src/intel/compiler/intel_shader_enums.h b/src/intel/compiler/intel_shader_enums.h index 5e1bd0f12ba..de04d69514b 100644 --- a/src/intel/compiler/intel_shader_enums.h +++ b/src/intel/compiler/intel_shader_enums.h @@ -11,6 +11,7 @@ #endif #include "compiler/shader_enums.h" +#include "util/bitscan.h" #include "util/enum_operators.h" #ifdef __cplusplus diff --git a/src/intel/genxml/genX_rt_cl_pack.h b/src/intel/genxml/genX_rt_cl_pack.h index 097de558b74..1ccfb9c7c84 100644 --- a/src/intel/genxml/genX_rt_cl_pack.h +++ b/src/intel/genxml/genX_rt_cl_pack.h @@ -14,9 +14,11 @@ #elif (GFX_VERx10 == 125) # include "genxml/gen125_rt_cl_pack.h" #elif (GFX_VERx10 == 200) -# include "genxml/gen200_rt_cl_pack.h" +# include "genxml/xe2_rt_cl_pack.h" #elif (GFX_VERx10 == 300) -# include "genxml/gen300_rt_cl_pack.h" +# include "genxml/xe3_rt_cl_pack.h" +#elif (GFX_VERx10 == 350) +# include "genxml/xe3p_rt_cl_pack.h" #else # error "Need to add a pack header include for this gen" #endif diff --git a/src/intel/genxml/meson.build b/src/intel/genxml/meson.build index 534d50a4231..5b934a6c531 100644 --- a/src/intel/genxml/meson.build +++ b/src/intel/genxml/meson.build @@ -128,15 +128,50 @@ endforeach genX_cl_included_symbols = [ # instructions + '3DMESH_3D', + '3DSTATE_CLIP', + 
'3DSTATE_CONSTANT_ALL', + '3DSTATE_CONSTANT_VS', '3DSTATE_DS', + '3DSTATE_GS', '3DSTATE_HS', '3DSTATE_INDEX_BUFFER', + '3DSTATE_MESH_CONTROL', + '3DSTATE_MESH_SHADER_DATA', + '3DSTATE_PS', + '3DSTATE_PS_EXTRA', + '3DSTATE_PS_BLEND', + '3DSTATE_RASTER', + '3DSTATE_SF', + '3DSTATE_STREAMOUT', + '3DSTATE_TASK_CONTROL', + '3DSTATE_TASK_SHADER_DATA', + '3DSTATE_TE', '3DSTATE_VERTEX_BUFFERS', + '3DSTATE_VF_TOPOLOGY', + '3DSTATE_VFG', + '3DSTATE_VS', + '3DSTATE_WM', '3DPRIMITIVE', '3DPRIMITIVE_EXTENDED', + 'COMPUTE_WALKER', + 'GPGPU_WALKER', + 'MEDIA_CURBE_LOAD', + 'MEDIA_INTERFACE_DESCRIPTOR_LOAD', + 'MEDIA_STATE_FLUSH', 'MI_ARB_CHECK', 'MI_BATCH_BUFFER_START', + 'MI_STORE_DATA_IMM', # structures + '3DSTATE_CONSTANT_ALL_DATA', + '3DSTATE_CONSTANT_BODY', + 'BINDLESS_SHADER_RECORD', + 'CALL_STACK_HANDLER', + 'COMPUTE_WALKER_BODY', + 'INTERFACE_DESCRIPTOR_DATA', + 'POSTSYNC_DATA', + 'RT_DISPATCH_GLOBALS', + 'RT_SHADER_TABLE', 'VERTEX_BUFFER_STATE', ] diff --git a/src/intel/shaders/dgc.cl b/src/intel/shaders/dgc.cl new file mode 100644 index 00000000000..f288a01c737 --- /dev/null +++ b/src/intel/shaders/dgc.cl @@ -0,0 +1,947 @@ +/* + * Copyright 2024 Intel Corporation + * SPDX-License-Identifier: MIT + */ + +#include "libintel_shaders.h" +#include "dev/intel_wa.h" +#include "vulkan/anv_types.h" + +#define HAS_STAGE(descriptor, stage) \ + (((descriptor)->active_stages & \ + BITFIELD_BIT(ANV_DGC_STAGE_##stage)) != 0) + +#if GFX_VER >= 11 + +static void +merge_dwords(global void *dst, global void *src1, global void *src2, uint32_t n_dwords) +{ + for (uint32_t i = 0; i < n_dwords; i += 4) { + if (n_dwords - i >= 4) { + *(global uint4 *)(dst + i * 4) = *(global uint4 *)(src1 + i * 4) | + *(global uint4 *)(src2 + i * 4) ; + } else if (n_dwords - i >= 3) { + *(global uint3 *)(dst + i * 4) = *(global uint3 *)(src1 + i * 4) | + *(global uint3 *)(src2 + i * 4) ; + } else if (n_dwords - i >= 2) { + *(global uint2 *)(dst + i * 4) = *(global uint2 *)(src1 + i * 4) | + *(global uint2 
*)(src2 + i * 4) ; + } else { + *(global uint *)(dst + i * 4) = *(global uint *)(src1 + i * 4) | + *(global uint *)(src2 + i * 4) ; + } + } +} + +#if GFX_VER >= 12 +static uint32_t +write_3DSTATE_CONSTANT_ALL(global void *dst_ptr, + global void *push_data_addr, + global struct anv_dgc_push_stage_state *stage_state, + global struct anv_dgc_gfx_state *state, + uint32_t stage_enabled) +{ + uint32_t n_slots = stage_state->legacy.n_slots; + struct GENX(3DSTATE_CONSTANT_ALL) v = { + GENX(3DSTATE_CONSTANT_ALL_header), + .DWordLength = GENX(3DSTATE_CONSTANT_ALL_length) - + GENX(3DSTATE_CONSTANT_ALL_length_bias) + + n_slots * GENX(3DSTATE_CONSTANT_ALL_DATA_length), + .ShaderUpdateEnable = stage_enabled, + .MOCS = state->layout.push_constants.mocs, + .PointerBufferMask = (1u << n_slots) - 1, + }; + GENX(3DSTATE_CONSTANT_ALL_pack)(dst_ptr, &v); + + dst_ptr += GENX(3DSTATE_CONSTANT_ALL_length) * 4; + + for (uint32_t i = 0; i < n_slots; i++) { + struct anv_dgc_push_stage_slot slot = stage_state->legacy.slots[i]; + + if (slot.type == ANV_DGC_PUSH_SLOT_TYPE_PUSH_CONSTANTS) { + struct GENX(3DSTATE_CONSTANT_ALL_DATA) vd = { + .ConstantBufferReadLength = slot.push_data_size / 32, + .PointerToConstantBuffer = (uint64_t) push_data_addr + slot.push_data_offset, + }; + GENX(3DSTATE_CONSTANT_ALL_DATA_pack)(dst_ptr, &vd); + } else { + struct GENX(3DSTATE_CONSTANT_ALL_DATA) vd = { + .ConstantBufferReadLength = slot.push_data_size / 32, + .PointerToConstantBuffer = state->push_constants.addresses[i], + }; + GENX(3DSTATE_CONSTANT_ALL_DATA_pack)(dst_ptr, &vd); + } + + dst_ptr += GENX(3DSTATE_CONSTANT_ALL_DATA_length) * 4; + } + + return 4 * (GENX(3DSTATE_CONSTANT_ALL_length) + + n_slots * GENX(3DSTATE_CONSTANT_ALL_DATA_length)); +} +#else +static uint64_t +pc_slot_address(global struct anv_dgc_push_stage_slot *slot, + global uint64_t *slot_address, + global void *push_data_addr) +{ + if (slot->type == ANV_DGC_PUSH_SLOT_TYPE_PUSH_CONSTANTS) { + return (uint64_t) push_data_addr + 
slot->push_data_offset; + } else { + return *slot_address; + } +} + +static uint32_t +write_3DSTATE_CONSTANT_XS(global void *dst_ptr, + global void *push_data_addr, + global struct anv_dgc_push_stage_state *stage_state, + global struct anv_dgc_gfx_state *state, + uint32_t stage_enabled) +{ + uint32_t opcode; + if (stage_enabled & BITFIELD_BIT(ANV_DGC_STAGE_VERTEX)) + opcode = 21; + else if (stage_enabled & BITFIELD_BIT(ANV_DGC_STAGE_TESS_CTRL)) + opcode = 25; + else if (stage_enabled & BITFIELD_BIT(ANV_DGC_STAGE_TESS_EVAL)) + opcode = 26; + else if (stage_enabled & BITFIELD_BIT(ANV_DGC_STAGE_GEOMETRY)) + opcode = 22; + else + opcode = 23; + + struct GENX(3DSTATE_CONSTANT_VS) v = { + GENX(3DSTATE_CONSTANT_VS_header), + ._3DCommandSubOpcode = opcode, + .ConstantBody = { + .Buffer = { + pc_slot_address(&stage_state->legacy.slots[0], + &state->push_constants.addresses[0], + push_data_addr), + pc_slot_address(&stage_state->legacy.slots[1], + &state->push_constants.addresses[1], + push_data_addr), + pc_slot_address(&stage_state->legacy.slots[2], + &state->push_constants.addresses[2], + push_data_addr), + pc_slot_address(&stage_state->legacy.slots[3], + &state->push_constants.addresses[3], + push_data_addr), + }, + .ReadLength = { + stage_state->legacy.slots[0].push_data_size / 32, + stage_state->legacy.slots[1].push_data_size / 32, + stage_state->legacy.slots[2].push_data_size / 32, + stage_state->legacy.slots[3].push_data_size / 32, + }, + }, + }; + GENX(3DSTATE_CONSTANT_VS_pack)(dst_ptr, &v); + + return 4 * GENX(3DSTATE_CONSTANT_VS_length); +} +#endif + +static void +write_app_push_constant_data(global void *push_data_ptr, + global struct anv_dgc_push_layout *pc_layout, + global void *seq_ptr, + global void *template_ptr, + uint32_t template_size, + uint32_t seq_idx) +{ + uint32_t num_entries = pc_layout->num_entries; + + /* Copy the push constant data prepared on the CPU into the preprocess + * buffer. 
Try to minimize the amount if the first entry partially or + * entirely overlaps. + */ + if (template_size > 0) { + if (num_entries > 0) { + struct anv_dgc_push_entry first_entry = pc_layout->entries[0]; + uint32_t entry_end = first_entry.push_offset + first_entry.size; + if (first_entry.push_offset > 0) { + genX(copy_data)(push_data_ptr, template_ptr, + first_entry.push_offset); + } + if (entry_end < template_size) { + genX(copy_data)(push_data_ptr + entry_end, + template_ptr + entry_end, + template_size - entry_end); + } + } else { + genX(copy_data)(push_data_ptr, template_ptr, template_size); + } + } + + /* Update push constant data using the indirect stream */ + for (uint32_t i = 0; i < num_entries; i++) { + struct anv_dgc_push_entry entry = pc_layout->entries[i]; + global void *pc_ptr = seq_ptr + entry.seq_offset; + genX(copy_data)(push_data_ptr + entry.push_offset, + pc_ptr, entry.size); + } + + if (pc_layout->seq_id_active) + *(uint32_t *)(push_data_ptr + pc_layout->seq_id_offset) = seq_idx; +} + +static void +write_cs_drv_push_constant_data(global struct anv_push_constants *push_data_ptr, + global void *driver_template_ptr, + uint32_t offset, uint32_t size, + global VkDispatchIndirectCommand *info) +{ + genX(copy_data)(&push_data_ptr->client_data[offset], + driver_template_ptr, size); + +#if GFX_VERx10 >= 125 + /* On Gfx12.5+ we always have the entire push constant space, so it's fine to copy */ + push_data_ptr->cs.num_workgroups[0] = info->x; + push_data_ptr->cs.num_workgroups[1] = info->y; + push_data_ptr->cs.num_workgroups[2] = info->z; +#else + /* Prior to Gfx12.5, the push constant data has to be aligned to 64B and + * the beginning is based off the first location the shader needs. So if + * the read location is does not include the workgroup, don't write it, we + * would be overwriting some other data in the generated commands/data. 
+ */ + if (offset <= offsetof(struct anv_push_constants, cs.num_workgroups[0])) { + push_data_ptr->cs.num_workgroups[0] = info->x; + push_data_ptr->cs.num_workgroups[1] = info->y; + push_data_ptr->cs.num_workgroups[2] = info->z; + } +#endif +} + +static void +write_rt_drv_push_constant_data(global void *driver_data_ptr, + global void *driver_template_ptr, + uint32_t size) +{ + genX(copy_data)(driver_data_ptr, driver_template_ptr, size); +} + +static void +write_gfx_drv_push_constant_data(global void *driver_data_ptr, + global void *driver_template_ptr, + uint32_t size) +{ + genX(copy_data)(driver_data_ptr, driver_template_ptr, size); +} + +static uint32_t +write_gfx_push_constant_commands(global void *push_cmd_ptr, + global void *push_data_ptr, + global struct anv_dgc_gfx_state *state) +{ + uint32_t cmd_offset = 0; + uint32_t push_stages = state->descriptor.push_constants.active_stages; + for (uint32_t s = ANV_DGC_STAGE_VERTEX; + s <= ANV_DGC_STAGE_FRAGMENT && push_stages != 0; s++) { + if ((BITFIELD_BIT(s) & push_stages) == 0) + continue; + + global struct anv_dgc_push_stage_state *stage_state = + &state->descriptor.push_constants.stages[s]; + +#if GFX_VER >= 12 + cmd_offset += write_3DSTATE_CONSTANT_ALL(push_cmd_ptr + cmd_offset, + push_data_ptr, + stage_state, + state, + BITFIELD_BIT(s)); +#else + cmd_offset += write_3DSTATE_CONSTANT_XS(push_cmd_ptr + cmd_offset, + push_data_ptr, + stage_state, + state, + BITFIELD_BIT(s)); +#endif + + push_stages &= ~BITFIELD_BIT(s); + } + +#if GFX_VERx10 >= 125 + /* Mesh & Task use a single combined push constants + driver constants + * pointer + */ + if (push_stages & BITFIELD_BIT(ANV_DGC_STAGE_TASK)) { + struct anv_dgc_push_bindless_stage pc = + state->descriptor.push_constants.stages[ANV_DGC_STAGE_TASK].bindless; + uint64_t pc_addr = (uint64_t) push_data_ptr + pc.push_data_offset; + struct GENX(3DSTATE_TASK_SHADER_DATA) data = { + GENX(3DSTATE_TASK_SHADER_DATA_header), + .InlineData = { + pc.inline_dwords[0] == 
ANV_INLINE_DWORD_PUSH_ADDRESS_LDW ? + pc_addr & 0xffffffff : ((global uint32_t *)push_data_ptr)[pc.inline_dwords[0]], + pc.inline_dwords[0] == ANV_INLINE_DWORD_PUSH_ADDRESS_LDW ? + pc_addr >> 32 : ((global uint32_t *)push_data_ptr)[pc.inline_dwords[1]], + ((global uint32_t *)push_data_ptr)[pc.inline_dwords[2]], + ((global uint32_t *)push_data_ptr)[pc.inline_dwords[3]], + ((global uint32_t *)push_data_ptr)[pc.inline_dwords[4]], + ((global uint32_t *)push_data_ptr)[pc.inline_dwords[5]], + ((global uint32_t *)push_data_ptr)[pc.inline_dwords[6]], + ((global uint32_t *)push_data_ptr)[pc.inline_dwords[7]], + }, + }; + GENX(3DSTATE_TASK_SHADER_DATA_pack)(push_cmd_ptr + cmd_offset, &data); + cmd_offset += GENX(3DSTATE_TASK_SHADER_DATA_length) * 4; + } + + + if (push_stages & BITFIELD_BIT(ANV_DGC_STAGE_MESH)) { + struct anv_dgc_push_bindless_stage pc = + state->descriptor.push_constants.stages[ANV_DGC_STAGE_MESH].bindless; + uint64_t pc_addr = (uint64_t) push_data_ptr + pc.push_data_offset; + struct GENX(3DSTATE_MESH_SHADER_DATA) data = { + GENX(3DSTATE_MESH_SHADER_DATA_header), + .InlineData = { + pc.inline_dwords[0] == ANV_INLINE_DWORD_PUSH_ADDRESS_LDW ? pc_addr & 0xffffffff : + ((global uint32_t *)push_data_ptr)[pc.inline_dwords[0]], + pc.inline_dwords[1] == ANV_INLINE_DWORD_PUSH_ADDRESS_UDW ? 
pc_addr >> 32 : + ((global uint32_t *)push_data_ptr)[pc.inline_dwords[1]], + ((global uint32_t *)push_data_ptr)[pc.inline_dwords[2]], + ((global uint32_t *)push_data_ptr)[pc.inline_dwords[3]], + ((global uint32_t *)push_data_ptr)[pc.inline_dwords[4]], + ((global uint32_t *)push_data_ptr)[pc.inline_dwords[5]], + ((global uint32_t *)push_data_ptr)[pc.inline_dwords[6]], + ((global uint32_t *)push_data_ptr)[pc.inline_dwords[7]], + }, + }; + GENX(3DSTATE_MESH_SHADER_DATA_pack)(push_cmd_ptr + cmd_offset, &data); + cmd_offset += GENX(3DSTATE_MESH_SHADER_DATA_length) * 4; +#undef PVDW_OR + } +#endif + + return cmd_offset; +} + +static global void * +get_ptr(global void *base, uint32_t stride, + uint32_t prolog_size, uint32_t seq_idx) +{ + return base + prolog_size + seq_idx * stride; +} + +static void +write_prolog_epilog(global void *cmd_base, uint32_t cmd_stride, + uint32_t max_count, uint32_t cmd_prolog_size, + uint32_t seq_idx, uint64_t return_addr) +{ + /* A write to the location of the MI_BATCH_BUFFER_START below. */ + genX(write_address)(cmd_base, + get_ptr(cmd_base, cmd_stride, + cmd_prolog_size, max_count) + 4, + return_addr); + + global void *next_addr = cmd_base + (GENX(MI_STORE_DATA_IMM_length) + 1 + + GENX(MI_BATCH_BUFFER_START_length)) * 4; + + genX(write_MI_BATCH_BUFFER_START)( + cmd_base + (GENX(MI_STORE_DATA_IMM_length) + 1) * 4, + (uint64_t)next_addr); + + /* Reenable the prefetcher. */ +#if GFX_VER >= 12 + struct GENX(MI_ARB_CHECK) v = { + GENX(MI_ARB_CHECK_header), + /* This is a trick to get the CLC->SPIRV not to use a constant variable + * for this. Otherwise we run into issues trying to store that variable + * in constant memory which is inefficient for a single dword and also + * not handled in our backend. + */ + .PreParserDisableMask = seq_idx == 0, + .PreParserDisable = false, + }; + GENX(MI_ARB_CHECK_pack)(next_addr, &v); +#endif + + /* This is the epilog, returning to the main batch. 
*/ + genX(write_MI_BATCH_BUFFER_START)( + get_ptr(cmd_base, cmd_stride, cmd_prolog_size, max_count), + return_addr); +} + +static void +write_return_addr(global void *cmd_base, uint32_t cmd_stride, + uint32_t max_count, uint32_t cmd_prolog_size, + uint64_t return_addr) +{ + /* A write to the location of the MI_BATCH_BUFFER_START below. */ + genX(write_address)(cmd_base, + get_ptr(cmd_base, cmd_stride, + cmd_prolog_size, max_count) + 4, + return_addr); +} + +void +genX(libanv_preprocess_gfx_generate)(global void *cmd_base, + uint32_t cmd_stride, + global void *data_base, + uint32_t data_stride, + global void *seq_base, + uint32_t seq_stride, + global uint32_t *seq_count, + uint32_t max_seq_count, + uint32_t cmd_prolog_size, + uint32_t data_prolog_size, + global struct anv_dgc_gfx_state *state, + global void *const_ptr, + uint32_t const_size, + global void *driver_const_ptr, + uint64_t return_addr, + uint32_t flags, + uint32_t seq_idx) +{ + uint32_t max_count = seq_count != 0 ? min(*seq_count, max_seq_count) : max_seq_count; + + if (seq_idx == 0) { + write_prolog_epilog(cmd_base, cmd_stride, max_count, + cmd_prolog_size, seq_idx, return_addr); + } + + if (seq_idx >= max_count) + return; + + /* Pointer to the stream data, layed out as described in stream_layout. */ + global void *seq_ptr = seq_base + seq_idx * seq_stride; + + /* Where to write the commands */ + global void *cmd_ptr = + get_ptr(cmd_base, cmd_stride, cmd_prolog_size, seq_idx); + + /* 3DSTATE_INDEX_BUFFER */ + struct anv_dgc_index_buffer index_buffer = state->layout.index_buffer; + if (index_buffer.cmd_size != 0) { + VkBindIndexBufferIndirectCommandEXT idx_data = + *(global VkBindIndexBufferIndirectCommandEXT *)( + seq_ptr + index_buffer.seq_offset); + + uint32_t index_format = + index_buffer.u32_value == idx_data.indexType ? INDEX_DWORD : + index_buffer.u16_value == idx_data.indexType ? INDEX_WORD : + index_buffer.u8_value == idx_data.indexType ? 
INDEX_BYTE : + INDEX_BYTE; + + genX(write_3DSTATE_INDEX_BUFFER)(cmd_ptr + index_buffer.cmd_offset, + idx_data.bufferAddress, + idx_data.size, + index_format, + index_buffer.mocs); + } + + /* 3DSTATE_VERTEX_BUFFERS */ + uint32_t n_vertex_buffers = state->layout.vertex_buffers.n_buffers; + if (n_vertex_buffers) { + global void *cmd_vb = cmd_ptr + state->layout.vertex_buffers.cmd_offset; + + genX(write_3DSTATE_VERTEX_BUFFERS)(cmd_vb, n_vertex_buffers); + cmd_vb += 4; + + uint16_t mocs = state->layout.vertex_buffers.mocs; + for (uint32_t i = 0; i < n_vertex_buffers; i++) { + struct anv_dgc_vertex_buffer vb = state->layout.vertex_buffers.buffers[i]; + + VkBindVertexBufferIndirectCommandEXT vtx_data = + *(global VkBindVertexBufferIndirectCommandEXT *)( + seq_ptr + vb.seq_offset); + + genX(write_VERTEX_BUFFER_STATE)(cmd_vb, mocs, vb.binding, + vtx_data.bufferAddress, + vtx_data.size, + vtx_data.stride); + cmd_vb += GENX(VERTEX_BUFFER_STATE_length) * 4; + } + } + +#if INTEL_WA_16011107343_GFX_VER || INTEL_WA_22018402687_GFX_VER + genX(copy_data)(cmd_ptr + state->layout.indirect_set.final_cmds_offset, + state->descriptor.final_commands, + state->layout.indirect_set.final_cmds_size); +#endif + + /* Push constants */ + enum anv_dgc_push_constant_flags pc_flags = + state->layout.push_constants.flags; + if (pc_flags & ANV_DGC_PUSH_CONSTANTS_CMD_ACTIVE) { + global void *push_data_ptr = + get_ptr(data_base, data_stride, data_prolog_size, seq_idx) + + state->layout.push_constants.data_offset; + + write_app_push_constant_data(push_data_ptr, + &state->layout.push_constants, + seq_ptr, const_ptr, + const_size, seq_idx); + write_gfx_drv_push_constant_data( + push_data_ptr + MAX_PUSH_CONSTANTS_SIZE, + driver_const_ptr, ANV_DRIVER_PUSH_CONSTANTS_SIZE); + + write_gfx_push_constant_commands(cmd_ptr + + state->layout.push_constants.cmd_offset, + push_data_ptr, + state); + } + + /* 3DPRIMITIVE / 3DMESH_3D */ + bool is_predicated = (flags & ANV_GENERATED_FLAG_PREDICATED) != 0; + bool 
tbimr_enabled = (flags & ANV_GENERATED_FLAG_TBIMR) != 0; + switch (state->layout.draw.draw_type) { + case ANV_DGC_DRAW_TYPE_SEQUENTIAL: + genX(write_draw)(cmd_ptr + state->layout.draw.cmd_offset, + seq_ptr + state->layout.draw.seq_offset, + 0 /* draw_id_ptr */, + 0 /* draw_id, always 0 per spec */, + state->draw.instance_multiplier, + false /* indexed */, + is_predicated, + tbimr_enabled, + true /* uses_base, unused for Gfx11+ */, + true /* uses_draw_id, unused for Gfx11+ */, + 0 /* mocs, unused for Gfx11+ */); + break; + + case ANV_DGC_DRAW_TYPE_INDEXED: + genX(write_draw)(cmd_ptr + state->layout.draw.cmd_offset, + seq_ptr + state->layout.draw.seq_offset, + 0 /* draw_id_ptr */, + 0 /* draw_id, always 0 per spec */, + state->draw.instance_multiplier, + true /* indexed */, + is_predicated, + tbimr_enabled, + true /* uses_base, unused for Gfx11+ */, + true /* uses_draw_id, unused for Gfx11+ */, + 0 /* mocs, unused for Gfx11+ */); + break; + +#if GFX_VERx10 >= 125 + case ANV_DGC_DRAW_TYPE_MESH: + genX(write_3DMESH_3D)(cmd_ptr + state->layout.draw.cmd_offset, + seq_ptr + state->layout.draw.seq_offset, + is_predicated, + tbimr_enabled); + break; +#endif + } +} + +#if GFX_VERx10 >= 125 +static void +emit_dispatch_commands(global void *cmd_base, + uint32_t cmd_stride, + uint32_t seq_idx, + uint32_t prolog_size, + global void *push_data_ptr, + global struct anv_dgc_cs_layout *layout, + global struct anv_dgc_cs_descriptor *descriptor, + global void *interface_descriptor_data_ptr, + uint32_t flags, + global VkDispatchIndirectCommand *info) +{ + global void *cmd_ptr = get_ptr(cmd_base, cmd_stride, prolog_size, seq_idx); + + uint64_t pc_addr = (uint64_t)push_data_ptr + descriptor->push_data_offset; + + struct GENX(COMPUTE_WALKER) v = { + .PredicateEnable = (flags & ANV_GENERATED_FLAG_PREDICATED) != 0, + .body = { + .ThreadGroupIDXDimension = info->x, + .ThreadGroupIDYDimension = info->y, + .ThreadGroupIDZDimension = info->z, + .ExecutionMask = descriptor->right_mask, + 
.InlineData = { + descriptor->gfx125.inline_dwords[0] == ANV_INLINE_DWORD_PUSH_ADDRESS_LDW ? + pc_addr & 0xffffffff : ((global uint32_t *)push_data_ptr)[descriptor->gfx125.inline_dwords[0]], + descriptor->gfx125.inline_dwords[0] == ANV_INLINE_DWORD_PUSH_ADDRESS_LDW ? + pc_addr >> 32 : ((global uint32_t *)push_data_ptr)[descriptor->gfx125.inline_dwords[1]], + ((global uint32_t *)push_data_ptr)[descriptor->gfx125.inline_dwords[2]], + ((global uint32_t *)push_data_ptr)[descriptor->gfx125.inline_dwords[3]], + ((global uint32_t *)push_data_ptr)[descriptor->gfx125.inline_dwords[4]], + ((global uint32_t *)push_data_ptr)[descriptor->gfx125.inline_dwords[5]], + ((global uint32_t *)push_data_ptr)[descriptor->gfx125.inline_dwords[6]], + ((global uint32_t *)push_data_ptr)[descriptor->gfx125.inline_dwords[7]], + }, + }, + }; + GENX(COMPUTE_WALKER_repack)(cmd_ptr, descriptor->gfx125.compute_walker, &v); +} +#else +static void +emit_dispatch_commands(global void *cmd_base, + uint32_t cmd_stride, + uint32_t seq_idx, + uint32_t cmd_prolog_size, + global void *data_ptr, + global struct anv_dgc_cs_layout *layout, + global struct anv_dgc_cs_descriptor *descriptor, + global void *interface_descriptor_data_ptr, + uint32_t flags, + global VkDispatchIndirectCommand *info) +{ + global void *cmd_ptr = get_ptr(cmd_base, cmd_stride, cmd_prolog_size, seq_idx); + + if (layout->indirect_set.active != 0) { + /* Emit MEDIA_VFE_STATE either for each sequence */ + genX(copy_data)(cmd_ptr, descriptor->gfx9.media_vfe_state, + sizeof(descriptor->gfx9.media_vfe_state)); + cmd_ptr += sizeof(descriptor->gfx9.media_vfe_state); + + /* Load the shader descriptor */ + global void *idd_ptr = data_ptr + layout->indirect_set.data_offset; + merge_dwords(idd_ptr, + interface_descriptor_data_ptr, + descriptor->gfx9.interface_descriptor_data, + GENX(INTERFACE_DESCRIPTOR_DATA_length)); + + uint32_t idd_offset = + ANV_DYNAMIC_VISIBLE_HEAP_OFFSET + ((uint64_t)idd_ptr) & 0xffffffff; + + struct 
GENX(MEDIA_INTERFACE_DESCRIPTOR_LOAD) mdd = { + GENX(MEDIA_INTERFACE_DESCRIPTOR_LOAD_header), + .InterfaceDescriptorTotalLength = GENX(INTERFACE_DESCRIPTOR_DATA_length) * 4, + .InterfaceDescriptorDataStartAddress = idd_offset, + }; + GENX(MEDIA_INTERFACE_DESCRIPTOR_LOAD_pack)(cmd_ptr, &mdd); + cmd_ptr += GENX(MEDIA_INTERFACE_DESCRIPTOR_LOAD_length) * 4; + } + + /* Push constant offset relative to the dynamic state heap */ + uint32_t dyn_push_data_offset = + ANV_DYNAMIC_VISIBLE_HEAP_OFFSET + (((uint64_t)data_ptr) & 0xffffffff); + + struct GENX(MEDIA_CURBE_LOAD) mdl = { + GENX(MEDIA_CURBE_LOAD_header), + .CURBETotalDataLength = descriptor->gfx9.cross_thread_push_size + + descriptor->gfx9.n_threads * + descriptor->gfx9.per_thread_push_size, + .CURBEDataStartAddress = dyn_push_data_offset, + }; + GENX(MEDIA_CURBE_LOAD_pack)(cmd_ptr, &mdl); + cmd_ptr += GENX(MEDIA_CURBE_LOAD_length) * 4; + + /* Emit the walker */ + struct GENX(GPGPU_WALKER) walker = { + .PredicateEnable = (flags & ANV_GENERATED_FLAG_PREDICATED) != 0, + .SIMDSize = descriptor->simd_size / 16, + .ThreadWidthCounterMaximum = descriptor->threads - 1, + .RightExecutionMask = descriptor->right_mask, + .BottomExecutionMask = 0xffffffff, + .ThreadGroupIDXDimension = info->x, + .ThreadGroupIDYDimension = info->y, + .ThreadGroupIDZDimension = info->z, + }; + GENX(GPGPU_WALKER_repack)(cmd_ptr, descriptor->gfx9.gpgpu_walker, &walker); + global uint32_t *walker_ptr = cmd_ptr; + cmd_ptr += GENX(GPGPU_WALKER_length) * 4; + + uint32_t per_thread_push_size = descriptor->gfx9.per_thread_push_size; + if (per_thread_push_size > 0) { + uint32_t cross_thread_push_size = descriptor->gfx9.cross_thread_push_size; + global void *per_thread_ptr0 = data_ptr + cross_thread_push_size; + global void *per_thread_ptr = per_thread_ptr0; + for (uint32_t t = 0; t < descriptor->gfx9.n_threads; t++) { + if (t > 0) { + genX(copy_data)(per_thread_ptr, per_thread_ptr0, + per_thread_push_size); + } + *(uint32_t*)(per_thread_ptr + 
descriptor->gfx9.subgroup_id_offset) = t; + per_thread_ptr += per_thread_push_size; + } + } + + struct GENX(MEDIA_STATE_FLUSH) flush = { + GENX(MEDIA_STATE_FLUSH_header), + }; + GENX(MEDIA_STATE_FLUSH_pack)(cmd_ptr, &flush); +} +#endif + +void +genX(libanv_preprocess_cs_generate)(global void *cmd_base, + uint32_t cmd_stride, + global void *data_base, + uint32_t data_stride, + global void *seq_base, + uint32_t seq_stride, + global uint32_t *seq_count, + uint32_t max_seq_count, + uint32_t cmd_prolog_size, + uint32_t data_prolog_size, + global struct anv_dgc_cs_layout *layout, + global struct anv_dgc_cs_descriptor *indirect_set, + global void *interface_descriptor_data_ptr, + global void *const_ptr, + uint32_t const_size, + global void *driver_const_ptr, + uint64_t return_addr, + uint32_t flags, + uint32_t seq_idx) +{ + uint32_t max_count = seq_count != 0 ? min(*seq_count, max_seq_count) : max_seq_count; + + if (seq_idx == 0) { + write_prolog_epilog(cmd_base, cmd_stride, max_count, + cmd_prolog_size, seq_idx, return_addr); + } + + if (seq_idx >= max_count) + return; + + /* Pointer to the application generated data, layed out as described in + * stream_layout. + */ + global void *seq_ptr = seq_base + seq_idx * seq_stride; + + /* Get the shader descriptor. */ + global struct anv_dgc_cs_descriptor *descriptor; + if (layout->indirect_set.active != 0) { + uint32_t set_idx = *(global uint32_t *)(seq_ptr + layout->indirect_set.seq_offset); + descriptor = &indirect_set[set_idx]; + } else { + descriptor = indirect_set; + } + + /* Prepare the push constant data. 
*/ + uint32_t push_data_offset = descriptor->push_data_offset; + + /* */ + global void *push_data_ptr = + get_ptr(data_base, data_stride, data_prolog_size, seq_idx) + + layout->push_constants.data_offset; +#if GFX_VERx10 >= 125 + write_app_push_constant_data( + push_data_ptr, &layout->push_constants, + seq_ptr, const_ptr, const_size, seq_idx); + write_cs_drv_push_constant_data( + push_data_ptr, driver_const_ptr, + MAX_PUSH_CONSTANTS_SIZE, + ANV_DRIVER_PUSH_CONSTANTS_SIZE, + seq_ptr + layout->dispatch.seq_offset); +#else + write_app_push_constant_data( + push_data_ptr, &layout->push_constants, + seq_ptr, const_ptr, const_size, seq_idx); + write_cs_drv_push_constant_data( + push_data_ptr - descriptor->push_data_offset, driver_const_ptr, + MAX2(descriptor->push_data_offset, MAX_PUSH_CONSTANTS_SIZE), + MIN2(ANV_DRIVER_PUSH_CONSTANTS_SIZE, + (MAX_PUSH_CONSTANTS_SIZE + ANV_DRIVER_PUSH_CONSTANTS_SIZE) - + descriptor->push_data_offset), + seq_ptr + layout->dispatch.seq_offset); +#endif + + /* Finally write the commands */ + emit_dispatch_commands(cmd_base, cmd_stride, seq_idx, cmd_prolog_size, + push_data_ptr, layout, descriptor, + interface_descriptor_data_ptr, flags, + seq_ptr + layout->dispatch.seq_offset); +} + +void +genX(libanv_postprocess_cs_generate)(global void *cmd_base, + uint32_t cmd_stride, + global void *data_base, + uint32_t data_stride, + global uint32_t *seq_count, + uint32_t max_seq_count, + uint32_t cmd_prolog_size, + uint32_t data_prolog_size, + uint32_t data_idd_offset, + global struct anv_dgc_cs_descriptor *descriptor, + uint64_t return_addr, + uint32_t seq_idx) +{ + uint32_t max_count = seq_count != 0 ? 
min(*seq_count, max_seq_count) : max_seq_count; + + if (seq_idx == 0) { + write_prolog_epilog(cmd_base, cmd_stride, max_count, + cmd_prolog_size, seq_idx, return_addr); + } + + if (seq_idx >= max_count) + return; + + /* Where to write the commands */ + global void *cmd_ptr = + get_ptr(cmd_base, cmd_stride, cmd_prolog_size, seq_idx); + + /* OR the driver INTERFACE_DESCRIPTOR_DATA dwords with the device generated + * ones. + */ + uint32_t n_dwords = 2; /* dwords covered from + * INTERFACE_DESCRIPTOR_DATA::SamplerCount to + * INTERFACE_DESCRIPTOR_DATA::BindingTablePointer + */ + +#if GFX_VERx10 >= 125 + uint32_t idd_offset_B = 12 /* offset in INTERFACE_DESCRIPTOR_DATA */; + uint32_t csw_body_offset_B = (GFX_VERx10 >= 200 ? 72 : 68) /* offset in COMPUTE_WALKER_BODY */; + uint32_t csw_offset_B = 4 /* offset in COMPUTE_WALKER */; + uint32_t inst_offset_B = csw_offset_B + csw_body_offset_B + idd_offset_B; + merge_dwords(cmd_ptr + inst_offset_B, + cmd_ptr + inst_offset_B, + &descriptor->gfx125.compute_walker[inst_offset_B / 4], + n_dwords); +#else + global void *idd_ptr = + get_ptr(data_base, data_stride, data_prolog_size, seq_idx) + + data_idd_offset; + uint32_t inst_offset_B = 12 /* offset in INTERFACE_DESCRIPTOR_DATA */; + merge_dwords(idd_ptr + inst_offset_B, + idd_ptr + inst_offset_B, + &descriptor->gfx9.interface_descriptor_data[inst_offset_B / 4], + n_dwords); +#endif +} + +#if GFX_VERx10 >= 125 +static uint3 +calc_local_trace_size(uint3 global_size) +{ + unsigned total_shift = 0; + uint3 local_shift = (uint3)(0, 0, 0); + + bool progress; + do { + progress = false; + for (unsigned i = 0; i < 3; i++) { + if ((1 << local_shift[i]) < global_size[i]) { + progress = true; + local_shift[i]++; + total_shift++; + } + + if (total_shift == 3) + return local_shift; + } + } while (progress); + + /* Assign whatever's left to x */ + local_shift[0] += 3 - total_shift; + + return local_shift; +} + +void +genX(libanv_preprocess_rt_generate)(global void *cmd_base, + uint32_t 
cmd_stride, + global void *data_base, + uint32_t data_stride, + global void *seq_base, + uint32_t seq_stride, + global uint32_t *seq_count, + uint32_t max_seq_count, + uint32_t cmd_prolog_size, + uint32_t data_prolog_size, + global struct anv_dgc_cs_layout *layout, + global void *compute_walker_template, + global void *rtdg_global_template, + global void *const_ptr, + uint32_t const_size, + global void *driver_const_ptr, + uint64_t return_addr, + uint32_t flags, + uint32_t seq_idx) +{ + uint32_t max_count = seq_count != 0 ? min(*seq_count, max_seq_count) : max_seq_count; + + if (seq_idx == 0) { + write_prolog_epilog(cmd_base, cmd_stride, max_count, + cmd_prolog_size, seq_idx, return_addr); + } + + if (seq_idx >= max_count) + return; + + /* Where to write the commands */ + global void *cmd_ptr = + get_ptr(cmd_base, cmd_stride, cmd_prolog_size, seq_idx); + + /* Pointer to the application generated data, layed out as described in + * stream_layout. + */ + global void *seq_ptr = seq_base + seq_idx * seq_stride; + + VkTraceRaysIndirectCommand2KHR *info = + ((global VkTraceRaysIndirectCommand2KHR *)(seq_ptr + layout->dispatch.seq_offset)); + uint3 launch_size = (uint3)(info->width, info->height, info->depth); + + /* RTDG + push constants */ + global void *push_data_ptr = + get_ptr(data_base, data_stride, data_prolog_size, seq_idx) + + layout->push_constants.data_offset; + global void *rtdg_ptr = push_data_ptr; + struct GENX(RT_DISPATCH_GLOBALS) rtdg = { + .LaunchWidth = launch_size.x, + .LaunchHeight = launch_size.y, + .LaunchDepth = launch_size.z, +#if GFX_VER >= 30 + .HitGroupStride = info->hitShaderBindingTableStride, + .HitGroupTable = info->hitShaderBindingTableAddress, + .MissGroupTable = info->missShaderBindingTableAddress, + .MissGroupStride = info->missShaderBindingTableStride, + .CallableGroupTable = info->callableShaderBindingTableAddress, + .CallableGroupStride = info->callableShaderBindingTableStride, +#else + .HitGroupTable = (struct GENX(RT_SHADER_TABLE)) 
{ + .BaseAddress = info->hitShaderBindingTableAddress, + .Stride = info->hitShaderBindingTableStride, + }, + .MissGroupTable = (struct GENX(RT_SHADER_TABLE)) { + .BaseAddress = info->missShaderBindingTableAddress, + .Stride = info->missShaderBindingTableStride, + }, + .CallableGroupTable = (struct GENX(RT_SHADER_TABLE)) { + .BaseAddress = info->callableShaderBindingTableAddress, + .Stride = info->callableShaderBindingTableStride, + }, +#endif + }; + GENX(RT_DISPATCH_GLOBALS_repack)(rtdg_ptr, rtdg_global_template, &rtdg); + + write_app_push_constant_data( + push_data_ptr + ANV_DGC_RT_GLOBAL_DISPATCH_SIZE, + &layout->push_constants, + seq_ptr, const_ptr, const_size, seq_idx); + write_rt_drv_push_constant_data( + push_data_ptr + + ANV_DGC_RT_GLOBAL_DISPATCH_SIZE + + MAX_PUSH_CONSTANTS_SIZE, + driver_const_ptr, + ANV_DRIVER_PUSH_CONSTANTS_SIZE); + + uint3 local_size_log2 = calc_local_trace_size(launch_size); + uint3 one = 1; + uint3 local_size = one << local_size_log2; + uint3 global_size = DIV_ROUND_UP(launch_size, local_size); + + /* Finally write the commands */ + global uint64_t *sbt = (global uint64_t *)info->raygenShaderRecordAddress; + struct GENX(COMPUTE_WALKER) v = { + .PredicateEnable = (flags & ANV_GENERATED_FLAG_PREDICATED) != 0, + .body = { + .LocalXMaximum = (1u << local_size_log2.x) - 1, + .LocalYMaximum = (1u << local_size_log2.y) - 1, + .LocalZMaximum = (1u << local_size_log2.z) - 1, + .ThreadGroupIDXDimension = global_size.x, + .ThreadGroupIDYDimension = global_size.y, + .ThreadGroupIDZDimension = global_size.z, + /* See struct brw_rt_raygen_trampoline_params */ + .InlineData = { + ((uint64_t) rtdg_ptr) & 0xffffffff, + ((uint64_t) rtdg_ptr) >> 32, + info->raygenShaderRecordAddress & 0xffffffff, + info->raygenShaderRecordAddress >> 32, + local_size_log2.x << 8 | + local_size_log2.y << 16 | + local_size_log2.z << 24, + }, + }, + }; + GENX(COMPUTE_WALKER_repack)(cmd_ptr, compute_walker_template, &v); +} +#endif /* GFX_VERx10 >= 125 */ + +#endif /* 
GFX_VER >= 11 */ diff --git a/src/intel/shaders/generate.cl b/src/intel/shaders/generate.cl index 94b6fea903d..c1c26d7eb06 100644 --- a/src/intel/shaders/generate.cl +++ b/src/intel/shaders/generate.cl @@ -4,6 +4,31 @@ #include "libintel_shaders.h" +void genX(write_address)(global void *dst_ptr, global void *address, uint64_t value) +{ + struct GENX(MI_STORE_DATA_IMM) v = { + GENX(MI_STORE_DATA_IMM_header), + .DWordLength = GENX(MI_STORE_DATA_IMM_length) - + GENX(MI_STORE_DATA_IMM_length_bias) + 1, +#if GFX_VER >= 12 + .ForceWriteCompletionCheck = true, +#endif + .Address = (uint64_t)address, + .ImmediateData = value, + }; + GENX(MI_STORE_DATA_IMM_pack)(dst_ptr, &v); +} + +void genX(write_3DSTATE_VF_TOPOLOGY)(global void *dst_ptr, + uint32_t topology) +{ + struct GENX(3DSTATE_VF_TOPOLOGY) v = { + GENX(3DSTATE_VF_TOPOLOGY_header), + .PrimitiveTopologyType = topology, + }; + GENX(3DSTATE_VF_TOPOLOGY_pack)(dst_ptr, &v); +} + void genX(write_3DSTATE_VERTEX_BUFFERS)(global void *dst_ptr, uint32_t buffer_count) { @@ -38,6 +63,25 @@ void genX(write_VERTEX_BUFFER_STATE)(global void *dst_ptr, GENX(VERTEX_BUFFER_STATE_pack)(dst_ptr, &v); } +void genX(write_3DSTATE_INDEX_BUFFER)(global void *dst_ptr, + uint64_t buffer_addr, + uint32_t buffer_size, + uint32_t index_format, + uint32_t mocs) +{ + struct GENX(3DSTATE_INDEX_BUFFER) v = { + GENX(3DSTATE_INDEX_BUFFER_header), + .MOCS = mocs, + .IndexFormat = index_format, +#if GFX_VER >= 12 + .L3BypassDisable = true, +#endif + .BufferStartingAddress = buffer_addr, + .BufferSize = buffer_size, + }; + GENX(3DSTATE_INDEX_BUFFER_pack)(dst_ptr, &v); +} + void genX(write_3DPRIMITIVE)(global void *dst_ptr, bool is_predicated, bool is_indexed, @@ -202,3 +246,25 @@ void genX(write_draw)(global uint32_t *dst_ptr, } #endif } + +#if GFX_VERx10 >= 125 +void genX(write_3DMESH_3D)(global uint32_t *dst_ptr, + global void *indirect_ptr, + bool is_predicated, + bool uses_tbimr) +{ + VkDrawMeshTasksIndirectCommandEXT data = + *((global 
VkDrawMeshTasksIndirectCommandEXT *)indirect_ptr); + + struct GENX(3DMESH_3D) v = { + GENX(3DMESH_3D_header), + .TBIMREnabled = uses_tbimr, + .PredicateEnable = is_predicated, + .ThreadGroupCountX = data.groupCountX, + .ThreadGroupCountY = data.groupCountY, + .ThreadGroupCountZ = data.groupCountZ, + + }; + GENX(3DMESH_3D_pack)(dst_ptr, &v); +} +#endif diff --git a/src/intel/shaders/libintel_shaders.h b/src/intel/shaders/libintel_shaders.h index df861eeddea..3cd89a79b34 100644 --- a/src/intel/shaders/libintel_shaders.h +++ b/src/intel/shaders/libintel_shaders.h @@ -13,17 +13,22 @@ #include "util/macros.h" -#else +#include "compiler/intel_shader_enums.h" -#define _MESA_LIBCL_ASSERT_IGNORE 1 +#else #include "libcl_vk.h" #include "genxml/gen_macros.h" #include "genxml/genX_cl_pack.h" +#include "genxml/genX_rt_cl_pack.h" -#define PRAGMA_POISON(param) +#include "compiler/intel_shader_enums.h" + +#define _3DPRIM_PATCHLIST(n) (0x20 + (n - 1)) #endif +#define ANV_GENERATED_MAX_VES (29) + /** * Flags for generated_draws.cl */ @@ -46,6 +51,10 @@ enum anv_generated_draw_flags { ANV_GENERATED_FLAG_WA_16011107343 = BITFIELD_BIT(7), /* Wa_22018402687 */ ANV_GENERATED_FLAG_WA_22018402687 = BITFIELD_BIT(8), + /* Wa_16014912113 */ + ANV_GENERATED_FLAG_WA_16014912113 = BITFIELD_BIT(9), + /* Wa_18022330953 / Wa_22011440098 */ + ANV_GENERATED_FLAG_WA_18022330953 = BITFIELD_BIT(10) }; /** @@ -58,6 +67,9 @@ enum anv_generated_draw_flags { #ifdef __OPENCL_VERSION__ +void genX(write_address)(global void *dst_ptr, + global void *address, uint64_t value); + void genX(write_3DSTATE_VERTEX_BUFFERS)(global void *dst_ptr, uint32_t buffer_count); @@ -68,6 +80,15 @@ void genX(write_VERTEX_BUFFER_STATE)(global void *dst_ptr, uint32_t size, uint32_t stride); +void genX(write_3DSTATE_INDEX_BUFFER)(global void *dst_ptr, + uint64_t buffer_addr, + uint32_t buffer_size, + uint32_t index_format, + uint32_t mocs); + +void genX(write_3DSTATE_VF_TOPOLOGY)(global void *dst_ptr, + uint32_t topology); + void 
genX(write_3DPRIMITIVE)(global void *dst_ptr, bool is_predicated, bool is_indexed, @@ -93,6 +114,13 @@ void genX(write_3DPRIMITIVE_EXTENDED)(global void *dst_ptr, uint32_t param_draw_id); #endif +#if GFX_VERx10 >= 125 +void genX(write_3DMESH_3D)(global uint32_t *dst_ptr, + global void *indirect_ptr, + bool is_predicated, + bool uses_tbimr); +#endif + void genX(write_MI_BATCH_BUFFER_START)(global void *dst_ptr, uint64_t addr); void genX(write_draw)(global uint32_t *dst_ptr, @@ -112,6 +140,10 @@ void genX(copy_data)(global void *dst_ptr, global void *src_ptr, uint32_t size); +void genX(set_data)(global void *dst_ptr, + uint32_t data, + uint32_t size); + #endif /* __OPENCL_VERSION__ */ #endif /* _LIBANV_SHADERS_H_ */ diff --git a/src/intel/shaders/meson.build b/src/intel/shaders/meson.build index dc72e50c1c9..041d68af612 100644 --- a/src/intel/shaders/meson.build +++ b/src/intel/shaders/meson.build @@ -20,6 +20,7 @@ endif intel_shader_files = files( 'libintel_shaders.h', + 'dgc.cl', 'generate.cl', 'generate_draws.cl', 'generate_draws_iris.cl', @@ -45,6 +46,7 @@ foreach gen : intel_shaders_gens command : [ prog_mesa_clc, intel_shader_files, '-o', '@OUTPUT@', '--depfile', '@DEPFILE@', '--', + '-Wno-initializer-overrides', '-DGFX_VERx10=@0@'.format(gen[0]), '-I' + join_paths(meson.current_source_dir(), '.'), '-I' + join_paths(dir_source_root, 'src/compiler/libcl'), diff --git a/src/intel/vulkan/anv_types.h b/src/intel/vulkan/anv_types.h index 023ad04a25d..cd8437df81a 100644 --- a/src/intel/vulkan/anv_types.h +++ b/src/intel/vulkan/anv_types.h @@ -194,5 +194,267 @@ struct anv_push_constants { }; }; +#define ANV_DRIVER_PUSH_CONSTANTS_SIZE (sizeof(struct anv_push_constants) - MAX_PUSH_CONSTANTS_SIZE) + #define ANV_INLINE_DWORD_PUSH_ADDRESS_LDW (UINT8_MAX - 0) #define ANV_INLINE_DWORD_PUSH_ADDRESS_UDW (UINT8_MAX - 1) + +/* Location of the user visible part of the dynamic state heap (1GiB) */ +#define ANV_DYNAMIC_VISIBLE_HEAP_OFFSET (1024 * 1024 * 1024) + +/** + * Stage enum 
for generated commands + */ +enum anv_dgc_stage { + ANV_DGC_STAGE_VERTEX = 0, + ANV_DGC_STAGE_TESS_CTRL, + ANV_DGC_STAGE_TESS_EVAL, + ANV_DGC_STAGE_GEOMETRY, + ANV_DGC_STAGE_FRAGMENT, + ANV_DGC_STAGE_TASK, + ANV_DGC_STAGE_MESH, + + ANV_DGC_STAGE_COMPUTE, + ANV_DGC_STAGE_RT, + + ANV_DGC_STAGES, +}; + +#define ANV_DGC_N_GFX_STAGES (ANV_DGC_STAGE_MESH + 1) + +enum anv_dgc_draw_type { + ANV_DGC_DRAW_TYPE_SEQUENTIAL, + ANV_DGC_DRAW_TYPE_INDEXED, + ANV_DGC_DRAW_TYPE_MESH, +}; + +#define ANV_DGC_RT_GLOBAL_DISPATCH_SIZE (128) + +enum anv_dgc_push_constant_flags { + ANV_DGC_PUSH_CONSTANTS_CMD_ACTIVE = BITFIELD_BIT(0), +}; + +/** + * This structure represents the indirect data layout (in + * VkGeneratedCommandsInfoEXT::indirectAddress) for push constants + */ +struct anv_dgc_push_layout { + struct anv_dgc_push_entry { + /* Location of the data to copy in the indirect buffer */ + uint32_t seq_offset; + + /* Location where to write the data in anv_push_constants::client_data[] + */ + uint16_t push_offset; + + /* Size of the data to copy */ + uint16_t size; + } entries[32]; + + uint8_t flags; /* enum anv_dgc_push_constant_flags */ + + uint8_t num_entries; + uint8_t mocs; + + /* Whether the sequence ID is active and at what offset we should write it + * in the push constant data + */ + uint16_t seq_id_active; + uint16_t seq_id_offset; + + /* Offset of the push constant commands in the preprocessed buffer. 
+    */
+   uint16_t cmd_offset;
+   uint16_t cmd_size;
+
+   /* Offset of the data in the indirect buffer, relative to
+    * VkGeneratedCommandsInfoEXT::indirectAddress
+    */
+   uint16_t data_offset;
+};
+
+/**
+ * This structure represents both the data layout (in
+ * VkGeneratedCommandsInfoEXT::indirectAddress) and the command layout in the
+ * preprocess buffer (in VkGeneratedCommandsInfoEXT::preprocessAddress) for
+ * graphics commands
+ */
+struct anv_dgc_gfx_layout {
+   struct anv_dgc_index_buffer {
+      uint16_t cmd_offset; /* Offset of 3DSTATE_INDEX_BUFFER */
+      uint16_t cmd_size;
+      uint16_t seq_offset; /* Offset of VkBindIndexBufferIndirectCommandEXT */
+      uint16_t mocs;
+      uint32_t u32_value;
+      uint32_t u16_value;
+      uint32_t u8_value;
+   } index_buffer;
+
+   struct {
+      struct anv_dgc_vertex_buffer {
+         uint16_t seq_offset; /* Offset of VkBindVertexBufferIndirectCommandEXT */
+         uint16_t binding;
+      } buffers[31];
+      uint16_t n_buffers;
+      uint16_t mocs;
+      uint16_t cmd_offset; /* Offset of 3DSTATE_VERTEX_BUFFERS */
+      uint16_t cmd_size;
+   } vertex_buffers;
+
+   struct anv_dgc_push_layout push_constants;
+
+   struct {
+      uint16_t final_cmds_offset;
+      uint16_t final_cmds_size;
+      uint32_t active;
+   } indirect_set;
+
+   struct {
+      uint16_t cmd_offset; /* Offset of 3DPRIMITIVE/3DMESH_3D */
+      uint16_t cmd_size;
+      uint16_t draw_type; /* enum anv_dgc_draw_type */
+      uint16_t seq_offset; /* Offset of :
+                            *    - VkDrawIndirectCommand
+                            *    - VkDrawIndexedIndirectCommand
+                            *    - VkDrawMeshTasksIndirectCommandEXT
+                            */
+   } draw;
+};
+
+/**
+ * This structure represents both the data layout (in
+ * VkGeneratedCommandsInfoEXT::indirectAddress) and the command layout in the
+ * preprocess buffer (in VkGeneratedCommandsInfoEXT::preprocessAddress) for
+ * compute commands
+ */
+struct anv_dgc_cs_layout {
+   struct anv_dgc_push_layout push_constants;
+
+   /* Location of the indirect execution set index */
+   struct {
+      uint32_t seq_offset;
+      uint16_t data_offset;
+      uint16_t active;
+   } indirect_set;
+
+   /* Offset of 
VkDispatchIndirectCommand */
+   struct {
+      uint32_t seq_offset;
+      uint16_t cmd_offset;
+      uint16_t pad;
+   } dispatch;
+};
+
+enum anv_dgc_push_slot_type {
+   ANV_DGC_PUSH_SLOT_TYPE_PUSH_CONSTANTS,
+   ANV_DGC_PUSH_SLOT_TYPE_OTHER,
+};
+
+/**
+ * This structure holds prepacked HW instructions for a set of graphics
+ * shaders forming a pipeline. It is part of the command buffer temporary
+ * memory.
+ */
+struct anv_dgc_gfx_descriptor {
+   /* Fully packed instructions ready to be copied directly into the
+    * preprocess buffer (for workarounds)
+    */
+   uint32_t final_commands[20];
+   uint32_t final_commands_size;
+
+   uint32_t wa_18019110168_remapping_table_offset;
+
+   struct {
+      struct anv_dgc_push_stage_state {
+         union {
+            struct {
+               struct anv_dgc_push_stage_slot {
+                  uint16_t push_data_offset;
+                  uint16_t push_data_size;
+                  uint32_t type; /* enum anv_dgc_push_slot_type */
+               } slots[4];
+               uint32_t n_slots;
+            } legacy;
+            struct anv_dgc_push_bindless_stage {
+               uint16_t push_data_offset;
+               uint16_t inline_dwords_count;
+               uint8_t inline_dwords[8];
+            } bindless;
+         };
+      } stages[ANV_DGC_N_GFX_STAGES];
+      uint32_t active_stages; /* Bitfield of enum anv_dgc_stage */
+   } push_constants;
+};
+
+/**
+ * This structure holds information about the graphics state for generation.
+ */
+struct anv_dgc_gfx_state {
+   struct anv_dgc_gfx_layout layout;
+
+   struct anv_dgc_gfx_descriptor descriptor;
+
+   struct {
+      uint64_t addresses[4];
+   } push_constants;
+
+   struct {
+      uint16_t instance_multiplier;
+      uint32_t flags; /* ANV_GENERATED_FLAG_* */
+   } draw;
+};
+
+/**
+ * This structure holds prepacked HW instructions for a compute shader. It is
+ * either located in the memory associated with VkIndirectExecutionSetEXT or
+ * part of the command buffer temporary memory if indirect execution set is
+ * not used. 
+ */ +struct anv_dgc_cs_descriptor { + union { + struct { + uint32_t compute_walker[40]; + uint32_t inline_dwords_count; + uint8_t inline_dwords[8]; + } gfx125; + + struct { + /* Needs to be the first field because + * MEDIA_INTERFACE_DESCRIPTOR_LOAD::InterfaceDescriptorDataStartAddress + * needs 64B alignment. + */ + uint32_t interface_descriptor_data[8]; + uint32_t gpgpu_walker[15]; + uint32_t media_vfe_state[9]; + + uint32_t n_threads; + uint16_t cross_thread_push_size; + uint8_t per_thread_push_size; + uint8_t subgroup_id_offset; + } gfx9; + }; + + uint32_t right_mask; + uint32_t threads; + uint32_t simd_size; + + uint32_t push_data_offset; + + /* Align the struct to 64B */ + uint32_t pad[1]; +}; + +/** + * This structure holds information for a ray tracing pipeline. + */ +struct anv_dgc_rt_indirect_descriptor { + uint32_t ray_stack_stride; + uint32_t stack_ids_per_dss; + uint32_t sw_stack_size; + + uint64_t call_handler; + + uint64_t hit_sbt; + uint64_t miss_sbt; + uint64_t callable_sbt; +};