From fee5106b53f8b331dce47546fec9b4d2393d22cd Mon Sep 17 00:00:00 2001 From: Lionel Landwerlin Date: Mon, 27 Apr 2026 01:05:17 +0300 Subject: [PATCH] anv: add Gfx9 support VK_EXT_device_generated_commands This platform just needs a bit more care around vertex buffer state emission. Signed-off-by: Lionel Landwerlin Acked-by: Alyssa Rosenzweig Part-of: --- src/intel/genxml/meson.build | 1 + src/intel/shaders/dgc.cl | 81 +++++++++++++++++++++--- src/intel/shaders/util.cl | 20 ++++++ src/intel/vulkan/anv_dgc_layout.c | 40 +++++++++++- src/intel/vulkan/anv_dgc_set.c | 9 +++ src/intel/vulkan/anv_physical_device.c | 3 +- src/intel/vulkan/anv_private.h | 2 - src/intel/vulkan/anv_types.h | 14 +++- src/intel/vulkan/genX_cmd_dgc.c | 15 +++++ src/intel/vulkan/genX_internal_kernels.c | 2 - 10 files changed, 168 insertions(+), 19 deletions(-) diff --git a/src/intel/genxml/meson.build b/src/intel/genxml/meson.build index 87006594bf8..893c822fa9b 100644 --- a/src/intel/genxml/meson.build +++ b/src/intel/genxml/meson.build @@ -230,6 +230,7 @@ genX_cl_included_symbols = [ 'MI_ARB_CHECK', 'MI_BATCH_BUFFER_START', 'MI_STORE_DATA_IMM', + 'PIPE_CONTROL', # structures '3DSTATE_CONSTANT_ALL_DATA', '3DSTATE_CONSTANT_BODY', diff --git a/src/intel/shaders/dgc.cl b/src/intel/shaders/dgc.cl index f288a01c737..ceb43d1fc3e 100644 --- a/src/intel/shaders/dgc.cl +++ b/src/intel/shaders/dgc.cl @@ -11,7 +11,7 @@ (((descriptor)->active_stages & \ BITFIELD_BIT(ANV_DGC_STAGE_##stage)) != 0) -#if GFX_VER >= 11 +#if GFX_VER >= 9 static void merge_dwords(global void *dst, global void *src1, global void *src2, uint32_t n_dwords) @@ -434,19 +434,35 @@ genX(libanv_preprocess_gfx_generate)(global void *cmd_base, /* 3DSTATE_VERTEX_BUFFERS */ uint32_t n_vertex_buffers = state->layout.vertex_buffers.n_buffers; - if (n_vertex_buffers) { + uint32_t n_draw_param_buffers = GFX_VER == 9 ? 
util_bitcount(state->descriptor.draw_params) : 0; + if (n_vertex_buffers > 0 || n_draw_param_buffers > 0) { global void *cmd_vb = cmd_ptr + state->layout.vertex_buffers.cmd_offset; - genX(write_3DSTATE_VERTEX_BUFFERS)(cmd_vb, n_vertex_buffers); + genX(write_3DSTATE_VERTEX_BUFFERS)(cmd_vb, n_vertex_buffers + n_draw_param_buffers); cmd_vb += 4; +#if GFX_VER == 9 + global void *prev_seq_ptr = seq_base + (seq_idx == 0 ? 0 : (seq_idx - 1)) * seq_stride; + bool needs_vf_inval = false; +#endif + uint16_t mocs = state->layout.vertex_buffers.mocs; for (uint32_t i = 0; i < n_vertex_buffers; i++) { struct anv_dgc_vertex_buffer vb = state->layout.vertex_buffers.buffers[i]; - VkBindVertexBufferIndirectCommandEXT vtx_data = *(global VkBindVertexBufferIndirectCommandEXT *)( seq_ptr + vb.seq_offset); +#if GFX_VER == 9 + VkBindVertexBufferIndirectCommandEXT prev_vtx_data = + *(global VkBindVertexBufferIndirectCommandEXT *)( + prev_seq_ptr + vb.seq_offset); + if ((vtx_data.bufferAddress >> 32) != (prev_vtx_data.bufferAddress >> 32)) { + uint32_t offset = vtx_data.bufferAddress & 0xffffffff; + uint32_t prev_offset = prev_vtx_data.bufferAddress & 0xffffffff; + if (offset >= prev_offset && offset < (prev_offset + prev_vtx_data.size)) + needs_vf_inval = true; + } +#endif genX(write_VERTEX_BUFFER_STATE)(cmd_vb, mocs, vb.binding, vtx_data.bufferAddress, @@ -454,6 +470,53 @@ genX(libanv_preprocess_gfx_generate)(global void *cmd_base, vtx_data.stride); cmd_vb += GENX(VERTEX_BUFFER_STATE_length) * 4; } + +#if GFX_VER == 9 + global uint32_t *draw_param_ptr = + get_ptr(data_base, data_stride, data_prolog_size, seq_idx) + + state->layout.push_constants.data_offset + + MAX_PUSH_CONSTANTS_SIZE + + ANV_DRIVER_PUSH_CONSTANTS_SIZE; + + if (state->descriptor.draw_params & ANV_DGC_DRAW_PARAM_BASE_INSTANCE_VERTEX) { + genX(write_VERTEX_BUFFER_STATE)(cmd_vb, mocs, ANV_SVGS_VB_INDEX, + (uint64_t)draw_param_ptr, 8, 0); + cmd_vb += GENX(VERTEX_BUFFER_STATE_length) * 4; + if (state->layout.draw.draw_type == 
ANV_DGC_DRAW_TYPE_SEQUENTIAL) { + VkDrawIndirectCommand data = + *((global VkDrawIndirectCommand *)(seq_ptr + state->layout.draw.seq_offset)); + draw_param_ptr[0] = data.firstVertex; + draw_param_ptr[1] = data.firstInstance; + } else { + VkDrawIndexedIndirectCommand data = + *((global VkDrawIndexedIndirectCommand *)(seq_ptr + state->layout.draw.seq_offset)); + draw_param_ptr[0] = data.vertexOffset; + draw_param_ptr[1] = data.firstInstance; + } + draw_param_ptr += 2; + } + if (state->descriptor.draw_params & ANV_DGC_DRAW_PARAM_DRAW_ID) { + genX(write_VERTEX_BUFFER_STATE)(cmd_vb, mocs, ANV_DRAWID_VB_INDEX, + (uint64_t)draw_param_ptr, 4, 0); + cmd_vb += GENX(VERTEX_BUFFER_STATE_length) * 4; + /* gl_DrawID is always 0 since we don't support + * VK_INDIRECT_COMMANDS_TOKEN_TYPE_DRAW_COUNT_EXT + */ + draw_param_ptr[0] = 0; + draw_param_ptr += 1; + } + + if (needs_vf_inval) { + struct GENX(PIPE_CONTROL) pc = { + .CommandStreamerStallEnable = true, + .VFCacheInvalidationEnable = true, + }; + GENX(PIPE_CONTROL_pack)(cmd_vb, &pc); + } else { + genX(set_data)(cmd_vb, GENX(PIPE_CONTROL_length) * 4, 0); + } + cmd_vb += GENX(PIPE_CONTROL_length) * 4; +#endif } #if INTEL_WA_16011107343_GFX_VER || INTEL_WA_22018402687_GFX_VER @@ -497,8 +560,8 @@ genX(libanv_preprocess_gfx_generate)(global void *cmd_base, false /* indexed */, is_predicated, tbimr_enabled, - true /* uses_base, unused for Gfx11+ */, - true /* uses_draw_id, unused for Gfx11+ */, + false /* uses_base, unused for Gfx11+ */, + false /* uses_draw_id, unused for Gfx11+ */, 0 /* mocs, unused for Gfx11+ */); break; @@ -511,8 +574,8 @@ genX(libanv_preprocess_gfx_generate)(global void *cmd_base, true /* indexed */, is_predicated, tbimr_enabled, - true /* uses_base, unused for Gfx11+ */, - true /* uses_draw_id, unused for Gfx11+ */, + false /* uses_base, unused for Gfx11+ */, + false /* uses_draw_id, unused for Gfx11+ */, 0 /* mocs, unused for Gfx11+ */); break; @@ -944,4 +1007,4 @@ genX(libanv_preprocess_rt_generate)(global 
void *cmd_base, } #endif /* GFX_VERx10 >= 125 */ -#endif /* GFX_VER >= 11 */ +#endif /* GFX_VER >= 9 */ diff --git a/src/intel/shaders/util.cl b/src/intel/shaders/util.cl index ade800a8a3e..2739dc9aa34 100644 --- a/src/intel/shaders/util.cl +++ b/src/intel/shaders/util.cl @@ -43,3 +43,23 @@ void genX(copy_data)(global void *dst_ptr, } } } + +/* Fill size bytes at dst_ptr with the 32-bit value data using a single lane; + * size must be a multiple of 4. + */ +void genX(set_data)(global void *dst_ptr, + uint32_t size, + uint32_t data) +{ + for (uint32_t offset = 0; offset < size; offset += 16) { + if (offset + 16 <= size) { + *(global uint4 *)(dst_ptr + offset) = (uint4)(data); + } else if (offset + 12 <= size) { + *(global uint3 *)(dst_ptr + offset) = (uint3)(data); + } else if (offset + 8 <= size) { + *(global uint2 *)(dst_ptr + offset) = (uint2)(data); + } else if (offset + 4 <= size) { + *(global uint *)(dst_ptr + offset) = data; + } + } +} diff --git a/src/intel/vulkan/anv_dgc_layout.c b/src/intel/vulkan/anv_dgc_layout.c index 355083271d4..332f745aef4 100644 --- a/src/intel/vulkan/anv_dgc_layout.c +++ b/src/intel/vulkan/anv_dgc_layout.c @@ -203,9 +203,26 @@ VkResult anv_CreateIndirectCommandsLayoutEXT( } /* 3DSTATE_VERTEX_BUFFERS */ - if (vk_layout->dgc_info & BITFIELD_BIT(MESA_VK_DGC_VB)) { + if (devinfo->ver == 9) { + const uint32_t n_vb_entries = + 2 + util_bitcount(vk_layout->vertex_bindings); layout_add_command(layout_obj, (1 /* TODO: _3DSTATE_VERTEX_BUFFERS_length(devinfo) */ + + /* Number of vertex buffers + draw params (Gfx9 only) */ + n_vb_entries * + VERTEX_BUFFER_STATE_length(devinfo)) * 4, + "vertex"); + if (vk_layout->dgc_info & BITFIELD_BIT(MESA_VK_DGC_VB)) { + layout_add_command(layout_obj, + PIPE_CONTROL_length(devinfo) * 4, + "vertex cache inval"); + } + /* Draw params data, gl_BaseInstance, gl_BaseVertex, gl_DrawID */ + layout_add_data(layout_obj, 4 * 3, 4, NULL); + } else if (vk_layout->dgc_info & BITFIELD_BIT(MESA_VK_DGC_VB)) { + layout_add_command(layout_obj, + (1 
/* TODO: _3DSTATE_VERTEX_BUFFERS_length(devinfo) */ + + /* Number of vertex buffers */ util_bitcount(vk_layout->vertex_bindings) * VERTEX_BUFFER_STATE_length(devinfo)) * 4, "vertex"); @@ -499,9 +516,26 @@ anv_dgc_fill_gfx_layout(struct anv_dgc_gfx_layout *layout, layout->vertex_buffers.buffers[i].binding = vk_layout->vb_layouts[i].binding; } - - cmd_offset += layout->vertex_buffers.cmd_size; } + if (devinfo->ver == 9) { + const struct brw_vs_prog_data *vs_prog_data = + get_shader_vs_prog_data(shaders[MESA_SHADER_VERTEX]); + if (vs_prog_data->uses_firstvertex || + vs_prog_data->uses_baseinstance || + vs_prog_data->uses_drawid) { + layout->vertex_buffers.cmd_size = MAX2( + layout->vertex_buffers.cmd_size, + 4 /* TODO: _3DSTATE_VERTEX_BUFFERS_length(devinfo) */); + if (vs_prog_data->uses_firstvertex || + vs_prog_data->uses_baseinstance) + layout->vertex_buffers.cmd_size += VERTEX_BUFFER_STATE_length(devinfo) * 4; + if (vs_prog_data->uses_drawid) + layout->vertex_buffers.cmd_size += VERTEX_BUFFER_STATE_length(devinfo) * 4; + } + if (vk_layout->dgc_info & BITFIELD_BIT(MESA_VK_DGC_VB)) + layout->vertex_buffers.cmd_size += PIPE_CONTROL_length(devinfo) * 4; + } + cmd_offset += layout->vertex_buffers.cmd_size; layout->indirect_set.final_cmds_offset = cmd_offset; if (intel_needs_workaround(devinfo, 16011107343) && diff --git a/src/intel/vulkan/anv_dgc_set.c b/src/intel/vulkan/anv_dgc_set.c index 47c942fd4a6..5d662437357 100644 --- a/src/intel/vulkan/anv_dgc_set.c +++ b/src/intel/vulkan/anv_dgc_set.c @@ -68,6 +68,15 @@ anv_write_gfx_indirect_descriptor(struct anv_device *device, } assert(descriptor->final_commands_size <= sizeof(descriptor->final_commands)); + if (device->info->ver == 9) { + const struct brw_vs_prog_data *vs_prog_data = get_gfx_vs_prog_data(gfx); + + descriptor->draw_params = + ((vs_prog_data->uses_firstvertex || vs_prog_data->uses_baseinstance) ? + ANV_DGC_DRAW_PARAM_BASE_INSTANCE_VERTEX : 0) | + (vs_prog_data->uses_drawid ? 
ANV_DGC_DRAW_PARAM_DRAW_ID : 0); + } + anv_foreach_vk_stage(vk_stage, ANV_GRAPHICS_STAGE_BITS) { enum anv_dgc_stage gen_stage = anv_vk_stage_to_dgc_stage(vk_stage); enum mesa_shader_stage stage = vk_to_mesa_shader_stage(vk_stage); diff --git a/src/intel/vulkan/anv_physical_device.c b/src/intel/vulkan/anv_physical_device.c index 95d39bf6e3b..874e4fb8e80 100644 --- a/src/intel/vulkan/anv_physical_device.c +++ b/src/intel/vulkan/anv_physical_device.c @@ -304,8 +304,7 @@ get_device_extensions(const struct anv_physical_device *device, * buffer approach, at the expense of late preprocessing. But this is * for later. */ - .EXT_device_generated_commands = device->info.verx10 >= 125 || - (device->info.ver >= 11 || ANV_DEBUG(EXPERIMENTAL)), + .EXT_device_generated_commands = device->info.verx10 >= 125 || ANV_DEBUG(EXPERIMENTAL), .EXT_device_memory_report = true, #ifdef VK_USE_PLATFORM_DISPLAY_KHR .EXT_display_control = true, diff --git a/src/intel/vulkan/anv_private.h b/src/intel/vulkan/anv_private.h index 4d484b007ca..ec5173dc0c6 100644 --- a/src/intel/vulkan/anv_private.h +++ b/src/intel/vulkan/anv_private.h @@ -163,8 +163,6 @@ struct intel_perf_query_result; #define BINDING_TABLE_VIEW_SIZE (1u << 20) #define BINDING_TABLE_POOL_DEFAULT_BLOCK_SIZE (4096) -#define HW_MAX_VBS 33 - /* 3DSTATE_VERTEX_BUFFER supports 33 VBs, but before Gen11 we used 2 * for base & drawid SGVs */ static inline int diff --git a/src/intel/vulkan/anv_types.h b/src/intel/vulkan/anv_types.h index cd8437df81a..3d758bc4cc7 100644 --- a/src/intel/vulkan/anv_types.h +++ b/src/intel/vulkan/anv_types.h @@ -75,6 +75,8 @@ */ #define MAX_BINDING_TABLE_SIZE 240 +#define HW_MAX_VBS 33 + /* 3DSTATE_VERTEX_BUFFER supports 33 VBs, but these limits are applied on Gen9 * graphics, where 2 VBs are reserved for base & drawid SGVs. 
*/ @@ -350,6 +352,11 @@ enum anv_dgc_push_slot_type { ANV_DGC_PUSH_SLOT_TYPE_OTHER, }; +enum anv_dgc_draw_params { + ANV_DGC_DRAW_PARAM_BASE_INSTANCE_VERTEX = BITFIELD_BIT(0), + ANV_DGC_DRAW_PARAM_DRAW_ID = BITFIELD_BIT(1), +}; + /** * This structure holds prepacked HW instructions for a set of graphics * shaders forming a pipeline . It is part of the command buffer temporary @@ -362,7 +369,12 @@ struct anv_dgc_gfx_descriptor { uint32_t final_commands[20]; uint32_t final_commands_size; - uint32_t wa_18019110168_remapping_table_offset; + union { + /* Gfx12.5 only */ + uint32_t wa_18019110168_remapping_table_offset; + /* Gfx9 only */ + enum anv_dgc_draw_params draw_params; + }; struct { struct anv_dgc_push_stage_state { diff --git a/src/intel/vulkan/genX_cmd_dgc.c b/src/intel/vulkan/genX_cmd_dgc.c index cb9ea19af52..6fdd93a3c18 100644 --- a/src/intel/vulkan/genX_cmd_dgc.c +++ b/src/intel/vulkan/genX_cmd_dgc.c @@ -786,6 +786,21 @@ void genX(CmdExecuteGeneratedCommandsEXT)( if (cmd_buffer->state.conditional_render_enabled) genX(cmd_emit_conditional_render_predicate)(cmd_buffer); +#if GFX_VER == 9 + /* Gfx9 has a VF cache issue (it only considers the bottom 32 bits of the VF + * buffer address). Since we're likely to emit those in the DGC buffer, + * invalidate the cache here; further invalidation is emitted in the + * generated commands if needed. + */ + anv_add_pending_pipe_bits(cmd_buffer, + VK_PIPELINE_STAGE_2_FRAGMENT_SHADER_BIT_KHR | + VK_PIPELINE_STAGE_2_COMPUTE_SHADER_BIT_KHR, + VK_PIPELINE_STAGE_2_TOP_OF_PIPE_BIT, + ANV_PIPE_VF_CACHE_INVALIDATE_BIT, + "Gfx9 VF cache inval pre dgc exec"); + genX(cmd_buffer_apply_pipe_flushes)(cmd_buffer); +#endif + /* If a shader runs, flush the data to make it visible to CS. 
*/ if (params) { anv_add_pending_pipe_bits(cmd_buffer, diff --git a/src/intel/vulkan/genX_internal_kernels.c b/src/intel/vulkan/genX_internal_kernels.c index 93ab74ec7bc..8f5873580b3 100644 --- a/src/intel/vulkan/genX_internal_kernels.c +++ b/src/intel/vulkan/genX_internal_kernels.c @@ -110,7 +110,6 @@ genX(call_internal_shader)(nir_builder *b, enum anv_internal_kernel_name shader_ nir_imul_imm(b, load_compute_index(b), 4)); return sizeof(struct anv_memcpy_params); -#if GFX_VER >= 11 case ANV_INTERNAL_KERNEL_DGC_GFX_COMPUTE: case ANV_INTERNAL_KERNEL_DGC_GFX_FRAGMENT: genX(libanv_preprocess_gfx_generate)( @@ -186,7 +185,6 @@ genX(call_internal_shader)(nir_builder *b, enum anv_internal_kernel_name shader_ load_param(b, 32, struct anv_dgc_dump_params, n_dwords), load_param(b, 64, struct anv_dgc_dump_params, call_addr)); return sizeof(struct anv_dgc_dump_params); -#endif /* GFX_VER >= 11 */ #if GFX_VERx10 >= 125 case ANV_INTERNAL_KERNEL_DGC_RT_COMPUTE: