anv: add Gfx9 support for VK_EXT_device_generated_commands

This platform just needs a bit more care around vertex buffer state
emission.

Signed-off-by: Lionel Landwerlin <lionel.g.landwerlin@intel.com>
Acked-by: Alyssa Rosenzweig <alyssa.rosenzweig@intel.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/31384>
This commit is contained in:
Lionel Landwerlin 2026-04-27 01:05:17 +03:00 committed by Marge Bot
parent afabf6e350
commit fee5106b53
10 changed files with 168 additions and 19 deletions

View file

@ -230,6 +230,7 @@ genX_cl_included_symbols = [
'MI_ARB_CHECK',
'MI_BATCH_BUFFER_START',
'MI_STORE_DATA_IMM',
'PIPE_CONTROL',
# structures
'3DSTATE_CONSTANT_ALL_DATA',
'3DSTATE_CONSTANT_BODY',

View file

@ -11,7 +11,7 @@
(((descriptor)->active_stages & \
BITFIELD_BIT(ANV_DGC_STAGE_##stage)) != 0)
#if GFX_VER >= 11
#if GFX_VER >= 9
static void
merge_dwords(global void *dst, global void *src1, global void *src2, uint32_t n_dwords)
@ -434,19 +434,35 @@ genX(libanv_preprocess_gfx_generate)(global void *cmd_base,
/* 3DSTATE_VERTEX_BUFFERS */
uint32_t n_vertex_buffers = state->layout.vertex_buffers.n_buffers;
if (n_vertex_buffers) {
uint32_t n_draw_param_buffers = GFX_VER == 9 ? util_bitcount(state->descriptor.draw_params) : 0;
if (n_vertex_buffers > 0 || n_draw_param_buffers > 0) {
global void *cmd_vb = cmd_ptr + state->layout.vertex_buffers.cmd_offset;
genX(write_3DSTATE_VERTEX_BUFFERS)(cmd_vb, n_vertex_buffers);
genX(write_3DSTATE_VERTEX_BUFFERS)(cmd_vb, n_vertex_buffers + n_draw_param_buffers);
cmd_vb += 4;
#if GFX_VER == 9
global void *prev_seq_ptr = seq_base + (seq_idx == 0 ? 0 : (seq_idx - 1)) * seq_stride;
bool needs_vf_inval = false;
#endif
uint16_t mocs = state->layout.vertex_buffers.mocs;
for (uint32_t i = 0; i < n_vertex_buffers; i++) {
struct anv_dgc_vertex_buffer vb = state->layout.vertex_buffers.buffers[i];
VkBindVertexBufferIndirectCommandEXT vtx_data =
*(global VkBindVertexBufferIndirectCommandEXT *)(
seq_ptr + vb.seq_offset);
#if GFX_VER == 9
VkBindVertexBufferIndirectCommandEXT prev_vtx_data =
*(global VkBindVertexBufferIndirectCommandEXT *)(
prev_seq_ptr + vb.seq_offset);
if ((vtx_data.bufferAddress >> 32) != (prev_vtx_data.bufferAddress >> 32)) {
uint32_t offset = vtx_data.bufferAddress & 0xffffffff;
uint32_t prev_offset = prev_vtx_data.bufferAddress & 0xffffffff;
if (offset >= prev_offset && offset < (prev_offset + prev_vtx_data.size))
needs_vf_inval = true;
}
#endif
genX(write_VERTEX_BUFFER_STATE)(cmd_vb, mocs, vb.binding,
vtx_data.bufferAddress,
@ -454,6 +470,53 @@ genX(libanv_preprocess_gfx_generate)(global void *cmd_base,
vtx_data.stride);
cmd_vb += GENX(VERTEX_BUFFER_STATE_length) * 4;
}
#if GFX_VER == 9
global uint32_t *draw_param_ptr =
get_ptr(data_base, data_stride, data_prolog_size, seq_idx) +
state->layout.push_constants.data_offset +
MAX_PUSH_CONSTANTS_SIZE +
ANV_DRIVER_PUSH_CONSTANTS_SIZE;
if (state->descriptor.draw_params & ANV_DGC_DRAW_PARAM_BASE_INSTANCE_VERTEX) {
genX(write_VERTEX_BUFFER_STATE)(cmd_vb, mocs, ANV_SVGS_VB_INDEX,
(uint64_t)draw_param_ptr, 8, 0);
cmd_vb += GENX(VERTEX_BUFFER_STATE_length) * 4;
if (state->layout.draw.draw_type == ANV_DGC_DRAW_TYPE_SEQUENTIAL) {
VkDrawIndirectCommand data =
*((global VkDrawIndirectCommand *)(seq_ptr + state->layout.draw.seq_offset));
draw_param_ptr[0] = data.firstVertex;
draw_param_ptr[1] = data.firstInstance;
} else {
VkDrawIndexedIndirectCommand data =
*((global VkDrawIndexedIndirectCommand *)(seq_ptr + state->layout.draw.seq_offset));
draw_param_ptr[0] = data.vertexOffset;
draw_param_ptr[1] = data.firstInstance;
}
draw_param_ptr += 2;
}
if (state->descriptor.draw_params & ANV_DGC_DRAW_PARAM_DRAW_ID) {
genX(write_VERTEX_BUFFER_STATE)(cmd_vb, mocs, ANV_DRAWID_VB_INDEX,
(uint64_t)draw_param_ptr, 4, 0);
cmd_vb += GENX(VERTEX_BUFFER_STATE_length) * 4;
/* gl_DrawID is always 0 since we don't support
* VK_INDIRECT_COMMANDS_TOKEN_TYPE_DRAW_COUNT_EXT
*/
draw_param_ptr[0] = 0;
draw_param_ptr += 1;
}
if (needs_vf_inval) {
struct GENX(PIPE_CONTROL) pc = {
.CommandStreamerStallEnable = true,
.VFCacheInvalidationEnable = true,
};
GENX(PIPE_CONTROL_pack)(cmd_vb, &pc);
} else {
genX(set_data)(cmd_vb, GENX(PIPE_CONTROL_length) * 4, 0);
}
cmd_vb += GENX(PIPE_CONTROL_length) * 4;
#endif
}
#if INTEL_WA_16011107343_GFX_VER || INTEL_WA_22018402687_GFX_VER
@ -497,8 +560,8 @@ genX(libanv_preprocess_gfx_generate)(global void *cmd_base,
false /* indexed */,
is_predicated,
tbimr_enabled,
true /* uses_base, unused for Gfx11+ */,
true /* uses_draw_id, unused for Gfx11+ */,
false /* uses_base, unused for Gfx11+ */,
false /* uses_draw_id, unused for Gfx11+ */,
0 /* mocs, unused for Gfx11+ */);
break;
@ -511,8 +574,8 @@ genX(libanv_preprocess_gfx_generate)(global void *cmd_base,
true /* indexed */,
is_predicated,
tbimr_enabled,
true /* uses_base, unused for Gfx11+ */,
true /* uses_draw_id, unused for Gfx11+ */,
false /* uses_base, unused for Gfx11+ */,
false /* uses_draw_id, unused for Gfx11+ */,
0 /* mocs, unused for Gfx11+ */);
break;
@ -944,4 +1007,4 @@ genX(libanv_preprocess_rt_generate)(global void *cmd_base,
}
#endif /* GFX_VERx10 >= 125 */
#endif /* GFX_VER >= 11 */
#endif /* GFX_VER >= 9 */

View file

@ -43,3 +43,23 @@ void genX(copy_data)(global void *dst_ptr,
}
}
}
/* Fill size bytes starting at dst_ptr with the 32-bit value data.
*
* The whole range is written by the calling invocation (the loop does not
* split the work across lanes). Only whole dwords are written, so size is
* expected to be a multiple of 4: any trailing 1-3 bytes are left untouched.
*/
void genX(set_data)(global void *dst_ptr,
uint32_t size,
uint32_t data)
{
/* Advance 16 bytes per iteration, using the widest store (uint4, uint3,
* uint2 or uint) that still fits in the remaining size. The vector casts
* replicate data into every component.
*/
for (uint32_t offset = 0; offset < size; offset += 16) {
if (offset + 16 <= size) {
*(global uint4 *)(dst_ptr + offset) = (uint4)(data);
} else if (offset + 12 <= size) {
*(global uint3 *)(dst_ptr + offset) = (uint3)(data);
} else if (offset + 8 <= size) {
*(global uint2 *)(dst_ptr + offset) = (uint2)(data);
} else if (offset + 4 <= size) {
*(global uint *)(dst_ptr + offset) = data;
}
}
}

View file

@ -203,9 +203,26 @@ VkResult anv_CreateIndirectCommandsLayoutEXT(
}
/* 3DSTATE_VERTEX_BUFFERS */
if (vk_layout->dgc_info & BITFIELD_BIT(MESA_VK_DGC_VB)) {
if (devinfo->ver == 9) {
const uint32_t n_vb_entries =
2 + util_bitcount(vk_layout->vertex_bindings);
layout_add_command(layout_obj,
(1 /* TODO: _3DSTATE_VERTEX_BUFFERS_length(devinfo) */ +
/* Number of vertex buffers + draw params (Gfx9 only) */
n_vb_entries *
VERTEX_BUFFER_STATE_length(devinfo)) * 4,
"vertex");
if (vk_layout->dgc_info & BITFIELD_BIT(MESA_VK_DGC_VB)) {
layout_add_command(layout_obj,
PIPE_CONTROL_length(devinfo) * 4,
"vertex cache inval");
}
/* Draw params data, gl_BaseInstance, gl_BaseVertex, gl_DrawID */
layout_add_data(layout_obj, 4 * 3, 4, NULL);
} else if (vk_layout->dgc_info & BITFIELD_BIT(MESA_VK_DGC_VB)) {
layout_add_command(layout_obj,
(1 /* TODO: _3DSTATE_VERTEX_BUFFERS_length(devinfo) */ +
/* Number of vertex buffers */
util_bitcount(vk_layout->vertex_bindings) *
VERTEX_BUFFER_STATE_length(devinfo)) * 4,
"vertex");
@ -499,9 +516,26 @@ anv_dgc_fill_gfx_layout(struct anv_dgc_gfx_layout *layout,
layout->vertex_buffers.buffers[i].binding =
vk_layout->vb_layouts[i].binding;
}
cmd_offset += layout->vertex_buffers.cmd_size;
}
if (devinfo->ver == 9) {
const struct brw_vs_prog_data *vs_prog_data =
get_shader_vs_prog_data(shaders[MESA_SHADER_VERTEX]);
if (vs_prog_data->uses_firstvertex ||
vs_prog_data->uses_baseinstance ||
vs_prog_data->uses_drawid) {
layout->vertex_buffers.cmd_size = MAX2(
layout->vertex_buffers.cmd_size,
4 /* TODO: _3DSTATE_VERTEX_BUFFERS_length(devinfo) */);
if (vs_prog_data->uses_firstvertex ||
vs_prog_data->uses_baseinstance)
layout->vertex_buffers.cmd_size += VERTEX_BUFFER_STATE_length(devinfo) * 4;
if (vs_prog_data->uses_drawid)
layout->vertex_buffers.cmd_size += VERTEX_BUFFER_STATE_length(devinfo) * 4;
}
if (vk_layout->dgc_info & BITFIELD_BIT(MESA_VK_DGC_VB))
layout->vertex_buffers.cmd_size += PIPE_CONTROL_length(devinfo) * 4;
}
cmd_offset += layout->vertex_buffers.cmd_size;
layout->indirect_set.final_cmds_offset = cmd_offset;
if (intel_needs_workaround(devinfo, 16011107343) &&

View file

@ -68,6 +68,15 @@ anv_write_gfx_indirect_descriptor(struct anv_device *device,
}
assert(descriptor->final_commands_size <= sizeof(descriptor->final_commands));
if (device->info->ver == 9) {
const struct brw_vs_prog_data *vs_prog_data = get_gfx_vs_prog_data(gfx);
descriptor->draw_params =
((vs_prog_data->uses_firstvertex || vs_prog_data->uses_baseinstance) ?
ANV_DGC_DRAW_PARAM_BASE_INSTANCE_VERTEX : 0) |
(vs_prog_data->uses_drawid ? ANV_DGC_DRAW_PARAM_DRAW_ID : 0);
}
anv_foreach_vk_stage(vk_stage, ANV_GRAPHICS_STAGE_BITS) {
enum anv_dgc_stage gen_stage = anv_vk_stage_to_dgc_stage(vk_stage);
enum mesa_shader_stage stage = vk_to_mesa_shader_stage(vk_stage);

View file

@ -304,8 +304,7 @@ get_device_extensions(const struct anv_physical_device *device,
* buffer approach, at the expense of late preprocessing. But this is
* for later.
*/
.EXT_device_generated_commands = device->info.verx10 >= 125 ||
(device->info.ver >= 11 || ANV_DEBUG(EXPERIMENTAL)),
.EXT_device_generated_commands = device->info.verx10 >= 125 || ANV_DEBUG(EXPERIMENTAL),
.EXT_device_memory_report = true,
#ifdef VK_USE_PLATFORM_DISPLAY_KHR
.EXT_display_control = true,

View file

@ -163,8 +163,6 @@ struct intel_perf_query_result;
#define BINDING_TABLE_VIEW_SIZE (1u << 20)
#define BINDING_TABLE_POOL_DEFAULT_BLOCK_SIZE (4096)
#define HW_MAX_VBS 33
/* 3DSTATE_VERTEX_BUFFER supports 33 VBs, but before Gen11 we used 2
* for base & drawid SGVs */
static inline int

View file

@ -75,6 +75,8 @@
*/
#define MAX_BINDING_TABLE_SIZE 240
#define HW_MAX_VBS 33
/* 3DSTATE_VERTEX_BUFFER supports 33 VBs, but these limits are applied on Gen9
* graphics, where 2 VBs are reserved for base & drawid SGVs.
*/
@ -350,6 +352,11 @@ enum anv_dgc_push_slot_type {
ANV_DGC_PUSH_SLOT_TYPE_OTHER,
};
/* Bitmask of the draw parameters read by the vertex shader that have to be
* provided through an additional vertex buffer in the generated commands
* (Gfx9 only).
*/
enum anv_dgc_draw_params {
/* Vertex shader uses the first vertex and/or the base instance value */
ANV_DGC_DRAW_PARAM_BASE_INSTANCE_VERTEX = BITFIELD_BIT(0),
/* Vertex shader uses the draw ID value */
ANV_DGC_DRAW_PARAM_DRAW_ID = BITFIELD_BIT(1),
};
/**
* This structure holds prepacked HW instructions for a set of graphics
* shaders forming a pipeline. It is part of the command buffer temporary
@ -362,7 +369,12 @@ struct anv_dgc_gfx_descriptor {
uint32_t final_commands[20];
uint32_t final_commands_size;
uint32_t wa_18019110168_remapping_table_offset;
union {
/* Gfx12.5 only */
uint32_t wa_18019110168_remapping_table_offset;
/* Gfx9 only */
enum anv_dgc_draw_params draw_params;
};
struct {
struct anv_dgc_push_stage_state {

View file

@ -786,6 +786,21 @@ void genX(CmdExecuteGeneratedCommandsEXT)(
if (cmd_buffer->state.conditional_render_enabled)
genX(cmd_emit_conditional_render_predicate)(cmd_buffer);
#if GFX_VER == 9
/* Gfx9 has a VF cache issue (it only considers the bottom 32 bits of the
* vertex buffer address). Since we're likely to emit vertex buffers in the
* DGC buffer, invalidate the cache here; further invalidations are emitted
* in the generated commands if needed.
*/
anv_add_pending_pipe_bits(cmd_buffer,
VK_PIPELINE_STAGE_2_FRAGMENT_SHADER_BIT_KHR |
VK_PIPELINE_STAGE_2_COMPUTE_SHADER_BIT_KHR,
VK_PIPELINE_STAGE_2_TOP_OF_PIPE_BIT,
ANV_PIPE_VF_CACHE_INVALIDATE_BIT,
"Gfx9 VF cache inval pre dgc exec");
genX(cmd_buffer_apply_pipe_flushes)(cmd_buffer);
#endif
/* If a shader runs, flush the data to make it visible to CS. */
if (params) {
anv_add_pending_pipe_bits(cmd_buffer,

View file

@ -110,7 +110,6 @@ genX(call_internal_shader)(nir_builder *b, enum anv_internal_kernel_name shader_
nir_imul_imm(b, load_compute_index(b), 4));
return sizeof(struct anv_memcpy_params);
#if GFX_VER >= 11
case ANV_INTERNAL_KERNEL_DGC_GFX_COMPUTE:
case ANV_INTERNAL_KERNEL_DGC_GFX_FRAGMENT:
genX(libanv_preprocess_gfx_generate)(
@ -186,7 +185,6 @@ genX(call_internal_shader)(nir_builder *b, enum anv_internal_kernel_name shader_
load_param(b, 32, struct anv_dgc_dump_params, n_dwords),
load_param(b, 64, struct anv_dgc_dump_params, call_addr));
return sizeof(struct anv_dgc_dump_params);
#endif /* GFX_VER >= 11 */
#if GFX_VERx10 >= 125
case ANV_INTERNAL_KERNEL_DGC_RT_COMPUTE: