anv/intel: add device generated commands shaders

Signed-off-by: Lionel Landwerlin <lionel.g.landwerlin@intel.com>
Acked-by: Alyssa Rosenzweig <alyssa.rosenzweig@intel.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/31384>
This commit is contained in:
Lionel Landwerlin 2024-05-24 11:22:49 +03:00 committed by Marge Bot
parent c85647b968
commit 1281e2b9a0
8 changed files with 1352 additions and 5 deletions

View file

@ -11,6 +11,7 @@
#endif
#include "compiler/shader_enums.h"
#include "util/bitscan.h"
#include "util/enum_operators.h"
#ifdef __cplusplus

View file

@ -14,9 +14,11 @@
#elif (GFX_VERx10 == 125)
# include "genxml/gen125_rt_cl_pack.h"
#elif (GFX_VERx10 == 200)
# include "genxml/gen200_rt_cl_pack.h"
# include "genxml/xe2_rt_cl_pack.h"
#elif (GFX_VERx10 == 300)
# include "genxml/gen300_rt_cl_pack.h"
# include "genxml/xe3_rt_cl_pack.h"
#elif (GFX_VERx10 == 350)
# include "genxml/xe3p_rt_cl_pack.h"
#else
# error "Need to add a pack header include for this gen"
#endif

View file

@ -128,15 +128,50 @@ endforeach
# Names of genxml instructions and structures, grouped by the two section
# comments below.
# NOTE(review): presumably the allow-list of symbols emitted into the CL pack
# headers; the consumer of this list is not visible here - confirm.
genX_cl_included_symbols = [
# instructions
'3DMESH_3D',
'3DSTATE_CLIP',
'3DSTATE_CONSTANT_ALL',
'3DSTATE_CONSTANT_VS',
'3DSTATE_DS',
'3DSTATE_GS',
'3DSTATE_HS',
'3DSTATE_INDEX_BUFFER',
'3DSTATE_MESH_CONTROL',
'3DSTATE_MESH_SHADER_DATA',
'3DSTATE_PS',
'3DSTATE_PS_EXTRA',
'3DSTATE_PS_BLEND',
'3DSTATE_RASTER',
'3DSTATE_SF',
'3DSTATE_STREAMOUT',
'3DSTATE_TASK_CONTROL',
'3DSTATE_TASK_SHADER_DATA',
'3DSTATE_TE',
'3DSTATE_VERTEX_BUFFERS',
'3DSTATE_VF_TOPOLOGY',
'3DSTATE_VFG',
'3DSTATE_VS',
'3DSTATE_WM',
'3DPRIMITIVE',
'3DPRIMITIVE_EXTENDED',
'COMPUTE_WALKER',
'GPGPU_WALKER',
'MEDIA_CURBE_LOAD',
'MEDIA_INTERFACE_DESCRIPTOR_LOAD',
'MEDIA_STATE_FLUSH',
'MI_ARB_CHECK',
'MI_BATCH_BUFFER_START',
'MI_STORE_DATA_IMM',
# structures
'3DSTATE_CONSTANT_ALL_DATA',
'3DSTATE_CONSTANT_BODY',
'BINDLESS_SHADER_RECORD',
'CALL_STACK_HANDLER',
'COMPUTE_WALKER_BODY',
'INTERFACE_DESCRIPTOR_DATA',
'POSTSYNC_DATA',
'RT_DISPATCH_GLOBALS',
'RT_SHADER_TABLE',
'VERTEX_BUFFER_STATE',
]

947
src/intel/shaders/dgc.cl Normal file
View file

@ -0,0 +1,947 @@
/*
* Copyright 2024 Intel Corporation
* SPDX-License-Identifier: MIT
*/
#include "libintel_shaders.h"
#include "dev/intel_wa.h"
#include "vulkan/anv_types.h"
/* Evaluates to non-zero when the ANV_DGC_STAGE_<stage> bit is set in
 * (descriptor)->active_stages. `stage` is the token pasted after
 * ANV_DGC_STAGE_ (for instance VERTEX), not an expression.
 */
#define HAS_STAGE(descriptor, stage) \
(((descriptor)->active_stages & \
BITFIELD_BIT(ANV_DGC_STAGE_##stage)) != 0)
#if GFX_VER >= 11
/* OR two dword arrays together: dst[i] = src1[i] | src2[i] for n_dwords
 * dwords.
 *
 * Each chunk is fully loaded before it is stored, so dst may be the very same
 * address as src1 or src2 (callers in this file pass dst == src1).
 *
 * The tail never goes through a 3-component vector: in OpenCL C a uint3 has
 * the size of 4 components and compilers may lower a uint3 load/store through
 * a pointer to a 4-dword access, which would touch one dword past the
 * requested range. A remainder of 3 is therefore handled as 2 + 1 dwords.
 */
static void
merge_dwords(global void *dst, global void *src1, global void *src2, uint32_t n_dwords)
{
   uint32_t i = 0;

   /* Bulk of the work, 4 dwords at a time. */
   for (; n_dwords - i >= 4; i += 4) {
      *(global uint4 *)(dst + i * 4) = *(global uint4 *)(src1 + i * 4) |
                                       *(global uint4 *)(src2 + i * 4);
   }

   /* 2 or 3 dwords left: take 2 of them. */
   if (n_dwords - i >= 2) {
      *(global uint2 *)(dst + i * 4) = *(global uint2 *)(src1 + i * 4) |
                                       *(global uint2 *)(src2 + i * 4);
      i += 2;
   }

   /* Last odd dword. */
   if (n_dwords - i >= 1) {
      *(global uint *)(dst + i * 4) = *(global uint *)(src1 + i * 4) |
                                      *(global uint *)(src2 + i * 4);
   }
}
#if GFX_VER >= 12
/* Emit a 3DSTATE_CONSTANT_ALL instruction at dst_ptr, immediately followed by
 * one 3DSTATE_CONSTANT_ALL_DATA entry per legacy push slot of the stage.
 *
 * dst_ptr:        where the instruction is packed.
 * push_data_addr: base of the generated push constant data; slots of type
 *                 ANV_DGC_PUSH_SLOT_TYPE_PUSH_CONSTANTS point into it.
 * stage_state:    per-stage slot description (legacy.n_slots / legacy.slots).
 * state:          provides the MOCS and the addresses used by all the other
 *                 slot types.
 * stage_enabled:  value programmed in ShaderUpdateEnable.
 *
 * Returns the number of bytes written.
 */
static uint32_t
write_3DSTATE_CONSTANT_ALL(global void *dst_ptr,
                           global void *push_data_addr,
                           global struct anv_dgc_push_stage_state *stage_state,
                           global struct anv_dgc_gfx_state *state,
                           uint32_t stage_enabled)
{
   uint32_t slot_count = stage_state->legacy.n_slots;
   uint32_t total_dwords = GENX(3DSTATE_CONSTANT_ALL_length) +
                           slot_count * GENX(3DSTATE_CONSTANT_ALL_DATA_length);

   struct GENX(3DSTATE_CONSTANT_ALL) cmd = {
      GENX(3DSTATE_CONSTANT_ALL_header),
      .DWordLength        = total_dwords -
                            GENX(3DSTATE_CONSTANT_ALL_length_bias),
      .ShaderUpdateEnable = stage_enabled,
      .MOCS               = state->layout.push_constants.mocs,
      /* One bit per slot that gets a data entry below. */
      .PointerBufferMask  = (1u << slot_count) - 1,
   };
   GENX(3DSTATE_CONSTANT_ALL_pack)(dst_ptr, &cmd);

   global void *entry_ptr = dst_ptr + GENX(3DSTATE_CONSTANT_ALL_length) * 4;
   for (uint32_t i = 0; i < slot_count; i++) {
      struct anv_dgc_push_stage_slot slot = stage_state->legacy.slots[i];
      bool in_push_data = slot.type == ANV_DGC_PUSH_SLOT_TYPE_PUSH_CONSTANTS;

      struct GENX(3DSTATE_CONSTANT_ALL_DATA) entry = {
         /* push_data_size is in bytes, the field is programmed in units of
          * 32 of them.
          */
         .ConstantBufferReadLength = slot.push_data_size / 32,
         .PointerToConstantBuffer  = in_push_data ?
            (uint64_t) push_data_addr + slot.push_data_offset :
            state->push_constants.addresses[i],
      };
      GENX(3DSTATE_CONSTANT_ALL_DATA_pack)(entry_ptr, &entry);
      entry_ptr += GENX(3DSTATE_CONSTANT_ALL_DATA_length) * 4;
   }

   return 4 * total_dwords;
}
#else
/* Resolve the address a push slot should read from.
 *
 * Slots of type ANV_DGC_PUSH_SLOT_TYPE_PUSH_CONSTANTS live in the generated
 * push data (push_data_addr + slot->push_data_offset); every other slot type
 * uses the address stored at *slot_address, which is only dereferenced in
 * that case.
 */
static uint64_t
pc_slot_address(global struct anv_dgc_push_stage_slot *slot,
                global uint64_t *slot_address,
                global void *push_data_addr)
{
   bool in_push_data = slot->type == ANV_DGC_PUSH_SLOT_TYPE_PUSH_CONSTANTS;

   return in_push_data ?
          (uint64_t) push_data_addr + slot->push_data_offset :
          *slot_address;
}
/* Emit a per-stage 3DSTATE_CONSTANT_* instruction at dst_ptr.
 *
 * The instruction is packed with the 3DSTATE_CONSTANT_VS layout and its
 * sub-opcode is patched according to the first stage bit found in
 * stage_enabled (vertex, tess ctrl, tess eval, geometry, in that order;
 * anything else falls back to the last value).
 *
 * All 4 legacy slots are always programmed; each slot's address comes from
 * pc_slot_address().
 *
 * Returns the number of bytes written.
 */
static uint32_t
write_3DSTATE_CONSTANT_XS(global void *dst_ptr,
                          global void *push_data_addr,
                          global struct anv_dgc_push_stage_state *stage_state,
                          global struct anv_dgc_gfx_state *state,
                          uint32_t stage_enabled)
{
   /* NOTE(review): the numeric values are presumably the sub-opcodes of
    * 3DSTATE_CONSTANT_{VS,HS,DS,GS,PS} - confirm against genxml.
    */
   uint32_t sub_opcode =
      (stage_enabled & BITFIELD_BIT(ANV_DGC_STAGE_VERTEX))    ? 21 :
      (stage_enabled & BITFIELD_BIT(ANV_DGC_STAGE_TESS_CTRL)) ? 25 :
      (stage_enabled & BITFIELD_BIT(ANV_DGC_STAGE_TESS_EVAL)) ? 26 :
      (stage_enabled & BITFIELD_BIT(ANV_DGC_STAGE_GEOMETRY))  ? 22 :
                                                                23;

   global struct anv_dgc_push_stage_slot *slots =
      &stage_state->legacy.slots[0];
   global uint64_t *addresses = &state->push_constants.addresses[0];

   struct GENX(3DSTATE_CONSTANT_VS) cmd = {
      GENX(3DSTATE_CONSTANT_VS_header),
      ._3DCommandSubOpcode = sub_opcode,
      .ConstantBody = {
         .Buffer = {
            pc_slot_address(&slots[0], &addresses[0], push_data_addr),
            pc_slot_address(&slots[1], &addresses[1], push_data_addr),
            pc_slot_address(&slots[2], &addresses[2], push_data_addr),
            pc_slot_address(&slots[3], &addresses[3], push_data_addr),
         },
         /* Sizes are in bytes, the fields are programmed in units of 32. */
         .ReadLength = {
            slots[0].push_data_size / 32,
            slots[1].push_data_size / 32,
            slots[2].push_data_size / 32,
            slots[3].push_data_size / 32,
         },
      },
   };
   GENX(3DSTATE_CONSTANT_VS_pack)(dst_ptr, &cmd);

   return 4 * GENX(3DSTATE_CONSTANT_VS_length);
}
#endif
/* Build the application push constant data of one sequence.
 *
 * push_data_ptr: destination of this sequence's push constant data.
 * pc_layout:     entries mapping bytes of the indirect stream
 *                (seq_offset/size) to offsets in the push data (push_offset),
 *                plus the optional sequence index slot.
 * seq_ptr:       this sequence's data in the indirect stream.
 * template_ptr:  push constant data prepared on the CPU, template_size bytes.
 * seq_idx:       value stored at seq_id_offset when seq_id_active is set.
 */
static void
write_app_push_constant_data(global void *push_data_ptr,
                             global struct anv_dgc_push_layout *pc_layout,
                             global void *seq_ptr,
                             global void *template_ptr,
                             uint32_t template_size,
                             uint32_t seq_idx)
{
   uint32_t num_entries = pc_layout->num_entries;

   /* Copy the push constant data prepared on the CPU into the preprocess
    * buffer. Try to minimize the amount copied when the first entry partially
    * or entirely overlaps the template: that range is overwritten by the
    * stream data below anyway.
    */
   if (template_size > 0) {
      if (num_entries > 0) {
         struct anv_dgc_push_entry first_entry = pc_layout->entries[0];
         uint32_t entry_end = first_entry.push_offset + first_entry.size;

         /* Part of the template located before the first entry. Never read
          * more than template_size bytes from the template, even if the
          * first entry starts beyond its end.
          */
         uint32_t head_size = first_entry.push_offset;
         if (head_size > template_size)
            head_size = template_size;
         if (head_size > 0)
            genX(copy_data)(push_data_ptr, template_ptr, head_size);

         /* Part of the template located after the first entry. */
         if (entry_end < template_size) {
            genX(copy_data)(push_data_ptr + entry_end,
                            template_ptr + entry_end,
                            template_size - entry_end);
         }
      } else {
         genX(copy_data)(push_data_ptr, template_ptr, template_size);
      }
   }

   /* Update push constant data using the indirect stream */
   for (uint32_t i = 0; i < num_entries; i++) {
      struct anv_dgc_push_entry entry = pc_layout->entries[i];
      global void *pc_ptr = seq_ptr + entry.seq_offset;

      genX(copy_data)(push_data_ptr + entry.push_offset,
                      pc_ptr, entry.size);
   }

   /* Keep the global address space on the store, like every other access to
    * the generated data.
    */
   if (pc_layout->seq_id_active)
      *(global uint32_t *)(push_data_ptr + pc_layout->seq_id_offset) = seq_idx;
}
/* Write the driver push constants of a compute dispatch.
 *
 * Copies `size` bytes from driver_template_ptr to
 * push_data_ptr->client_data[offset], then stores the dispatch dimensions
 * read from `info` into push_data_ptr->cs.num_workgroups (subject to the
 * pre-Gfx12.5 restriction described below).
 */
static void
write_cs_drv_push_constant_data(global struct anv_push_constants *push_data_ptr,
                                global void *driver_template_ptr,
                                uint32_t offset, uint32_t size,
                                global VkDispatchIndirectCommand *info)
{
   genX(copy_data)(&push_data_ptr->client_data[offset],
                   driver_template_ptr, size);

#if GFX_VERx10 >= 125
   /* On Gfx12.5+ we always have the entire push constant space, so it's
    * always fine to write the workgroup counts.
    */
   bool has_num_workgroups = true;
#else
   /* Prior to Gfx12.5, the push constant data has to be aligned to 64B and
    * the beginning is based off the first location the shader needs. So if
    * the read location does not include the workgroup counts, don't write
    * them, we would be overwriting some other data in the generated
    * commands/data.
    */
   bool has_num_workgroups =
      offset <= offsetof(struct anv_push_constants, cs.num_workgroups[0]);
#endif

   if (has_num_workgroups) {
      push_data_ptr->cs.num_workgroups[0] = info->x;
      push_data_ptr->cs.num_workgroups[1] = info->y;
      push_data_ptr->cs.num_workgroups[2] = info->z;
   }
}
/* Copy `size` bytes of driver push constant data from driver_template_ptr to
 * driver_data_ptr. Used by the ray tracing generation entry point.
 */
static void
write_rt_drv_push_constant_data(global void *driver_data_ptr,
global void *driver_template_ptr,
uint32_t size)
{
genX(copy_data)(driver_data_ptr, driver_template_ptr, size);
}
/* Copy `size` bytes of driver push constant data from driver_template_ptr to
 * driver_data_ptr. Used by the graphics generation entry point.
 */
static void
write_gfx_drv_push_constant_data(global void *driver_data_ptr,
global void *driver_template_ptr,
uint32_t size)
{
genX(copy_data)(driver_data_ptr, driver_template_ptr, size);
}
/* Emit the push constant commands of all the active graphics stages.
 *
 * push_cmd_ptr:  where the commands are written, back to back.
 * push_data_ptr: base of this sequence's push constant data.
 * state:         descriptor.push_constants gives the active stages and their
 *                per-stage description.
 *
 * Vertex to fragment stages get a 3DSTATE_CONSTANT_ALL (Gfx12+) or a
 * per-stage 3DSTATE_CONSTANT_* (earlier). On Gfx12.5+, task and mesh stages
 * get 3DSTATE_{TASK,MESH}_SHADER_DATA with 8 inline dwords.
 *
 * Returns the number of bytes written at push_cmd_ptr.
 */
static uint32_t
write_gfx_push_constant_commands(global void *push_cmd_ptr,
                                 global void *push_data_ptr,
                                 global struct anv_dgc_gfx_state *state)
{
   uint32_t cmd_offset = 0;
   uint32_t push_stages = state->descriptor.push_constants.active_stages;

   for (uint32_t s = ANV_DGC_STAGE_VERTEX;
        s <= ANV_DGC_STAGE_FRAGMENT && push_stages != 0; s++) {
      if ((BITFIELD_BIT(s) & push_stages) == 0)
         continue;

      global struct anv_dgc_push_stage_state *stage_state =
         &state->descriptor.push_constants.stages[s];
#if GFX_VER >= 12
      cmd_offset += write_3DSTATE_CONSTANT_ALL(push_cmd_ptr + cmd_offset,
                                               push_data_ptr,
                                               stage_state,
                                               state,
                                               BITFIELD_BIT(s));
#else
      cmd_offset += write_3DSTATE_CONSTANT_XS(push_cmd_ptr + cmd_offset,
                                              push_data_ptr,
                                              stage_state,
                                              state,
                                              BITFIELD_BIT(s));
#endif
      /* Clearing handled stages lets the loop exit early and leaves only
       * the task/mesh bits for the code below.
       */
      push_stages &= ~BITFIELD_BIT(s);
   }

#if GFX_VERx10 >= 125
   /* Mesh & Task use a single combined push constants + driver constants
    * pointer.
    *
    * Inline dwords 0 and 1 are either a dword index into the push data or a
    * marker requesting the lower/upper half of the push data address. Each
    * dword is tested against its own marker so that task and mesh behave
    * the same way.
    */
   global uint32_t *push_dwords = (global uint32_t *)push_data_ptr;

   if (push_stages & BITFIELD_BIT(ANV_DGC_STAGE_TASK)) {
      struct anv_dgc_push_bindless_stage pc =
         state->descriptor.push_constants.stages[ANV_DGC_STAGE_TASK].bindless;
      uint64_t pc_addr = (uint64_t) push_data_ptr + pc.push_data_offset;

      struct GENX(3DSTATE_TASK_SHADER_DATA) data = {
         GENX(3DSTATE_TASK_SHADER_DATA_header),
         .InlineData = {
            pc.inline_dwords[0] == ANV_INLINE_DWORD_PUSH_ADDRESS_LDW ?
               pc_addr & 0xffffffff : push_dwords[pc.inline_dwords[0]],
            pc.inline_dwords[1] == ANV_INLINE_DWORD_PUSH_ADDRESS_UDW ?
               pc_addr >> 32 : push_dwords[pc.inline_dwords[1]],
            push_dwords[pc.inline_dwords[2]],
            push_dwords[pc.inline_dwords[3]],
            push_dwords[pc.inline_dwords[4]],
            push_dwords[pc.inline_dwords[5]],
            push_dwords[pc.inline_dwords[6]],
            push_dwords[pc.inline_dwords[7]],
         },
      };
      GENX(3DSTATE_TASK_SHADER_DATA_pack)(push_cmd_ptr + cmd_offset, &data);
      cmd_offset += GENX(3DSTATE_TASK_SHADER_DATA_length) * 4;
   }

   if (push_stages & BITFIELD_BIT(ANV_DGC_STAGE_MESH)) {
      struct anv_dgc_push_bindless_stage pc =
         state->descriptor.push_constants.stages[ANV_DGC_STAGE_MESH].bindless;
      uint64_t pc_addr = (uint64_t) push_data_ptr + pc.push_data_offset;

      struct GENX(3DSTATE_MESH_SHADER_DATA) data = {
         GENX(3DSTATE_MESH_SHADER_DATA_header),
         .InlineData = {
            pc.inline_dwords[0] == ANV_INLINE_DWORD_PUSH_ADDRESS_LDW ?
               pc_addr & 0xffffffff : push_dwords[pc.inline_dwords[0]],
            pc.inline_dwords[1] == ANV_INLINE_DWORD_PUSH_ADDRESS_UDW ?
               pc_addr >> 32 : push_dwords[pc.inline_dwords[1]],
            push_dwords[pc.inline_dwords[2]],
            push_dwords[pc.inline_dwords[3]],
            push_dwords[pc.inline_dwords[4]],
            push_dwords[pc.inline_dwords[5]],
            push_dwords[pc.inline_dwords[6]],
            push_dwords[pc.inline_dwords[7]],
         },
      };
      GENX(3DSTATE_MESH_SHADER_DATA_pack)(push_cmd_ptr + cmd_offset, &data);
      cmd_offset += GENX(3DSTATE_MESH_SHADER_DATA_length) * 4;
   }
#endif

   return cmd_offset;
}
/* Address of element seq_idx in a buffer made of a prolog of prolog_size
 * bytes followed by elements of `stride` bytes each.
 */
static global void *
get_ptr(global void *base, uint32_t stride,
        uint32_t prolog_size, uint32_t seq_idx)
{
   global void *first_element = base + prolog_size;

   return first_element + seq_idx * stride;
}
/* Write the prolog (at cmd_base) and the epilog (right after the last
 * sequence, i.e. at element max_count) of the generated command buffer.
 *
 * Prolog layout, as written here:
 *   - MI_STORE_DATA_IMM storing return_addr 4 bytes into the epilog
 *     (MI_STORE_DATA_IMM_length + 1 dwords)
 *   - MI_BATCH_BUFFER_START jumping to the instruction right after itself
 *   - MI_ARB_CHECK (Gfx12+ only)
 *
 * The epilog is an MI_BATCH_BUFFER_START to return_addr.
 */
static void
write_prolog_epilog(global void *cmd_base, uint32_t cmd_stride,
                    uint32_t max_count, uint32_t cmd_prolog_size,
                    uint32_t seq_idx, uint64_t return_addr)
{
   global void *epilog_ptr =
      get_ptr(cmd_base, cmd_stride, cmd_prolog_size, max_count);
   global void *jump_ptr =
      cmd_base + (GENX(MI_STORE_DATA_IMM_length) + 1) * 4;
   global void *after_jump_ptr =
      cmd_base + (GENX(MI_STORE_DATA_IMM_length) + 1 +
                  GENX(MI_BATCH_BUFFER_START_length)) * 4;

   /* A write to the location of the MI_BATCH_BUFFER_START emitted in the
    * epilog at the end of this function.
    */
   genX(write_address)(cmd_base, epilog_ptr + 4, return_addr);

   genX(write_MI_BATCH_BUFFER_START)(jump_ptr, (uint64_t)after_jump_ptr);

   /* Reenable the prefetcher. */
#if GFX_VER >= 12
   struct GENX(MI_ARB_CHECK) arb_check = {
      GENX(MI_ARB_CHECK_header),
      /* This is a trick to get the CLC->SPIRV not to use a constant variable
       * for this. Otherwise we run into issues trying to store that variable
       * in constant memory which is inefficient for a single dword and also
       * not handled in our backend.
       */
      .PreParserDisableMask = seq_idx == 0,
      .PreParserDisable = false,
   };
   GENX(MI_ARB_CHECK_pack)(after_jump_ptr, &arb_check);
#endif

   /* This is the epilog, returning to the main batch. */
   genX(write_MI_BATCH_BUFFER_START)(epilog_ptr, return_addr);
}
/* Write, at cmd_base, a store of return_addr targeting 4 bytes into the
 * epilog slot (the element located right after the last sequence).
 */
static void
write_return_addr(global void *cmd_base, uint32_t cmd_stride,
                  uint32_t max_count, uint32_t cmd_prolog_size,
                  uint64_t return_addr)
{
   global void *epilog_ptr =
      get_ptr(cmd_base, cmd_stride, cmd_prolog_size, max_count);

   /* A write to the location of the epilog MI_BATCH_BUFFER_START. */
   genX(write_address)(cmd_base, epilog_ptr + 4, return_addr);
}
/* Generate the commands of one graphics sequence (seq_idx).
 *
 * cmd_base/cmd_stride/cmd_prolog_size:    generated command buffer layout.
 * data_base/data_stride/data_prolog_size: generated data buffer layout.
 * seq_base/seq_stride:                    application indirect stream.
 * seq_count:      optional (may be 0) pointer to the sequence count, clamped
 *                 to max_seq_count.
 * state:          layout & descriptor of what needs to be generated.
 * const_ptr/const_size: CPU prepared application push constants.
 * driver_const_ptr:     CPU prepared driver push constants.
 * return_addr:    where the epilog jumps back to.
 * flags:          ANV_GENERATED_FLAG_* bits.
 *
 * Invocation 0 additionally writes the prolog/epilog, even when there is no
 * sequence to process.
 */
void
genX(libanv_preprocess_gfx_generate)(global void *cmd_base,
                                     uint32_t cmd_stride,
                                     global void *data_base,
                                     uint32_t data_stride,
                                     global void *seq_base,
                                     uint32_t seq_stride,
                                     global uint32_t *seq_count,
                                     uint32_t max_seq_count,
                                     uint32_t cmd_prolog_size,
                                     uint32_t data_prolog_size,
                                     global struct anv_dgc_gfx_state *state,
                                     global void *const_ptr,
                                     uint32_t const_size,
                                     global void *driver_const_ptr,
                                     uint64_t return_addr,
                                     uint32_t flags,
                                     uint32_t seq_idx)
{
   uint32_t max_count = max_seq_count;
   if (seq_count != 0)
      max_count = min(*seq_count, max_seq_count);

   if (seq_idx == 0) {
      write_prolog_epilog(cmd_base, cmd_stride, max_count,
                          cmd_prolog_size, seq_idx, return_addr);
   }

   if (seq_idx >= max_count)
      return;

   /* Pointer to the stream data, laid out as described in stream_layout. */
   global void *seq_ptr = seq_base + seq_idx * seq_stride;

   /* Where to write the commands */
   global void *cmd_ptr =
      get_ptr(cmd_base, cmd_stride, cmd_prolog_size, seq_idx);

   /* 3DSTATE_INDEX_BUFFER */
   struct anv_dgc_index_buffer ib_layout = state->layout.index_buffer;
   if (ib_layout.cmd_size != 0) {
      VkBindIndexBufferIndirectCommandEXT ib_bind =
         *(global VkBindIndexBufferIndirectCommandEXT *)(
            seq_ptr + ib_layout.seq_offset);

      /* Translate the application index type using the values stored in the
       * layout. Both a u8 match and an unrecognized value end up as
       * INDEX_BYTE.
       */
      uint32_t index_format;
      if (ib_layout.u32_value == ib_bind.indexType)
         index_format = INDEX_DWORD;
      else if (ib_layout.u16_value == ib_bind.indexType)
         index_format = INDEX_WORD;
      else
         index_format = INDEX_BYTE;

      genX(write_3DSTATE_INDEX_BUFFER)(cmd_ptr + ib_layout.cmd_offset,
                                       ib_bind.bufferAddress,
                                       ib_bind.size,
                                       index_format,
                                       ib_layout.mocs);
   }

   /* 3DSTATE_VERTEX_BUFFERS */
   uint32_t vb_count = state->layout.vertex_buffers.n_buffers;
   if (vb_count) {
      global void *vb_cmd_ptr =
         cmd_ptr + state->layout.vertex_buffers.cmd_offset;
      uint16_t mocs = state->layout.vertex_buffers.mocs;

      genX(write_3DSTATE_VERTEX_BUFFERS)(vb_cmd_ptr, vb_count);
      /* Skip the dword written above, the VERTEX_BUFFER_STATE entries
       * follow.
       */
      vb_cmd_ptr += 4;

      for (uint32_t i = 0; i < vb_count; i++) {
         struct anv_dgc_vertex_buffer vb =
            state->layout.vertex_buffers.buffers[i];
         VkBindVertexBufferIndirectCommandEXT vb_bind =
            *(global VkBindVertexBufferIndirectCommandEXT *)(
               seq_ptr + vb.seq_offset);

         genX(write_VERTEX_BUFFER_STATE)(vb_cmd_ptr, mocs, vb.binding,
                                         vb_bind.bufferAddress,
                                         vb_bind.size,
                                         vb_bind.stride);
         vb_cmd_ptr += GENX(VERTEX_BUFFER_STATE_length) * 4;
      }
   }

#if INTEL_WA_16011107343_GFX_VER || INTEL_WA_22018402687_GFX_VER
   /* Commands prepared in the descriptor, copied as is. */
   genX(copy_data)(cmd_ptr + state->layout.indirect_set.final_cmds_offset,
                   state->descriptor.final_commands,
                   state->layout.indirect_set.final_cmds_size);
#endif

   /* Push constants */
   enum anv_dgc_push_constant_flags pc_flags =
      state->layout.push_constants.flags;
   if (pc_flags & ANV_DGC_PUSH_CONSTANTS_CMD_ACTIVE) {
      global void *push_data_ptr =
         get_ptr(data_base, data_stride, data_prolog_size, seq_idx) +
         state->layout.push_constants.data_offset;

      /* Application data first, driver data right after it. */
      write_app_push_constant_data(push_data_ptr,
                                   &state->layout.push_constants,
                                   seq_ptr, const_ptr,
                                   const_size, seq_idx);
      write_gfx_drv_push_constant_data(
         push_data_ptr + MAX_PUSH_CONSTANTS_SIZE,
         driver_const_ptr, ANV_DRIVER_PUSH_CONSTANTS_SIZE);

      write_gfx_push_constant_commands(
         cmd_ptr + state->layout.push_constants.cmd_offset,
         push_data_ptr,
         state);
   }

   /* 3DPRIMITIVE / 3DMESH_3D */
   bool is_predicated = (flags & ANV_GENERATED_FLAG_PREDICATED) != 0;
   bool tbimr_enabled = (flags & ANV_GENERATED_FLAG_TBIMR) != 0;
   global void *draw_cmd_ptr = cmd_ptr + state->layout.draw.cmd_offset;
   global void *draw_seq_ptr = seq_ptr + state->layout.draw.seq_offset;

   switch (state->layout.draw.draw_type) {
   case ANV_DGC_DRAW_TYPE_SEQUENTIAL:
      genX(write_draw)(draw_cmd_ptr,
                       draw_seq_ptr,
                       0 /* draw_id_ptr */,
                       0 /* draw_id, always 0 per spec */,
                       state->draw.instance_multiplier,
                       false /* indexed */,
                       is_predicated,
                       tbimr_enabled,
                       true /* uses_base, unused for Gfx11+ */,
                       true /* uses_draw_id, unused for Gfx11+ */,
                       0 /* mocs, unused for Gfx11+ */);
      break;

   case ANV_DGC_DRAW_TYPE_INDEXED:
      genX(write_draw)(draw_cmd_ptr,
                       draw_seq_ptr,
                       0 /* draw_id_ptr */,
                       0 /* draw_id, always 0 per spec */,
                       state->draw.instance_multiplier,
                       true /* indexed */,
                       is_predicated,
                       tbimr_enabled,
                       true /* uses_base, unused for Gfx11+ */,
                       true /* uses_draw_id, unused for Gfx11+ */,
                       0 /* mocs, unused for Gfx11+ */);
      break;

#if GFX_VERx10 >= 125
   case ANV_DGC_DRAW_TYPE_MESH:
      genX(write_3DMESH_3D)(draw_cmd_ptr,
                            draw_seq_ptr,
                            is_predicated,
                            tbimr_enabled);
      break;
#endif
   }
}
#if GFX_VERx10 >= 125
/* Emit the COMPUTE_WALKER of one sequence (Gfx12.5+).
 *
 * The fields set here (predication, thread group counts, execution mask and
 * inline data) are combined with the prepacked
 * descriptor->gfx125.compute_walker through COMPUTE_WALKER_repack.
 *
 * cmd_base/cmd_stride/prolog_size/seq_idx: locate the command slot.
 * push_data_ptr: this sequence's push constant data.
 * descriptor:    shader descriptor selected for this sequence.
 * info:          dispatch dimensions read from the indirect stream.
 * layout and interface_descriptor_data_ptr are unused by this variant; they
 * keep the signature identical to the pre-Gfx12.5 one.
 */
static void
emit_dispatch_commands(global void *cmd_base,
                       uint32_t cmd_stride,
                       uint32_t seq_idx,
                       uint32_t prolog_size,
                       global void *push_data_ptr,
                       global struct anv_dgc_cs_layout *layout,
                       global struct anv_dgc_cs_descriptor *descriptor,
                       global void *interface_descriptor_data_ptr,
                       uint32_t flags,
                       global VkDispatchIndirectCommand *info)
{
   global void *cmd_ptr = get_ptr(cmd_base, cmd_stride, prolog_size, seq_idx);
   global uint32_t *push_dwords = (global uint32_t *)push_data_ptr;
   uint64_t pc_addr = (uint64_t)push_data_ptr + descriptor->push_data_offset;

   struct GENX(COMPUTE_WALKER) v = {
      .PredicateEnable = (flags & ANV_GENERATED_FLAG_PREDICATED) != 0,
      .body = {
         .ThreadGroupIDXDimension = info->x,
         .ThreadGroupIDYDimension = info->y,
         .ThreadGroupIDZDimension = info->z,
         .ExecutionMask = descriptor->right_mask,
         /* Inline dwords 0 and 1 are either a dword index into the push data
          * or a marker requesting the lower/upper half of the push data
          * address. Each dword is tested against its own marker, like the
          * mesh shader data path does.
          */
         .InlineData = {
            descriptor->gfx125.inline_dwords[0] == ANV_INLINE_DWORD_PUSH_ADDRESS_LDW ?
               pc_addr & 0xffffffff : push_dwords[descriptor->gfx125.inline_dwords[0]],
            descriptor->gfx125.inline_dwords[1] == ANV_INLINE_DWORD_PUSH_ADDRESS_UDW ?
               pc_addr >> 32 : push_dwords[descriptor->gfx125.inline_dwords[1]],
            push_dwords[descriptor->gfx125.inline_dwords[2]],
            push_dwords[descriptor->gfx125.inline_dwords[3]],
            push_dwords[descriptor->gfx125.inline_dwords[4]],
            push_dwords[descriptor->gfx125.inline_dwords[5]],
            push_dwords[descriptor->gfx125.inline_dwords[6]],
            push_dwords[descriptor->gfx125.inline_dwords[7]],
         },
      },
   };
   GENX(COMPUTE_WALKER_repack)(cmd_ptr, descriptor->gfx125.compute_walker, &v);
}
#else
/* Emit the dispatch commands of one sequence (prior to Gfx12.5).
 *
 * Commands written, in order, in the sequence's command slot:
 *   - when layout->indirect_set.active: a copy of the descriptor's
 *     MEDIA_VFE_STATE and a MEDIA_INTERFACE_DESCRIPTOR_LOAD pointing at an
 *     INTERFACE_DESCRIPTOR_DATA built in the data buffer
 *   - MEDIA_CURBE_LOAD pointing at the push constant data
 *   - GPGPU_WALKER (repacked on top of descriptor->gfx9.gpgpu_walker)
 *   - MEDIA_STATE_FLUSH
 *
 * It also replicates the per-thread push constant block for every thread and
 * stores each thread's index at subgroup_id_offset.
 *
 * data_ptr: this sequence's push constant data.
 * interface_descriptor_data_ptr: dwords ORed with the descriptor's
 *                                INTERFACE_DESCRIPTOR_DATA.
 * info:     dispatch dimensions read from the indirect stream.
 */
static void
emit_dispatch_commands(global void *cmd_base,
                       uint32_t cmd_stride,
                       uint32_t seq_idx,
                       uint32_t cmd_prolog_size,
                       global void *data_ptr,
                       global struct anv_dgc_cs_layout *layout,
                       global struct anv_dgc_cs_descriptor *descriptor,
                       global void *interface_descriptor_data_ptr,
                       uint32_t flags,
                       global VkDispatchIndirectCommand *info)
{
   global void *cmd_ptr = get_ptr(cmd_base, cmd_stride, cmd_prolog_size, seq_idx);

   if (layout->indirect_set.active != 0) {
      /* Emit MEDIA_VFE_STATE for each sequence */
      genX(copy_data)(cmd_ptr, descriptor->gfx9.media_vfe_state,
                      sizeof(descriptor->gfx9.media_vfe_state));
      cmd_ptr += sizeof(descriptor->gfx9.media_vfe_state);

      /* Load the shader descriptor */
      global void *idd_ptr = data_ptr + layout->indirect_set.data_offset;
      merge_dwords(idd_ptr,
                   interface_descriptor_data_ptr,
                   descriptor->gfx9.interface_descriptor_data,
                   GENX(INTERFACE_DESCRIPTOR_DATA_length));

      /* Explicit parentheses: only the lower 32 bits of the address are
       * added to the heap offset, same as for the push data below.
       */
      uint32_t idd_offset =
         ANV_DYNAMIC_VISIBLE_HEAP_OFFSET + (((uint64_t)idd_ptr) & 0xffffffff);

      struct GENX(MEDIA_INTERFACE_DESCRIPTOR_LOAD) mdd = {
         GENX(MEDIA_INTERFACE_DESCRIPTOR_LOAD_header),
         .InterfaceDescriptorTotalLength = GENX(INTERFACE_DESCRIPTOR_DATA_length) * 4,
         .InterfaceDescriptorDataStartAddress = idd_offset,
      };
      GENX(MEDIA_INTERFACE_DESCRIPTOR_LOAD_pack)(cmd_ptr, &mdd);
      cmd_ptr += GENX(MEDIA_INTERFACE_DESCRIPTOR_LOAD_length) * 4;
   }

   /* Push constant offset relative to the dynamic state heap */
   uint32_t dyn_push_data_offset =
      ANV_DYNAMIC_VISIBLE_HEAP_OFFSET + (((uint64_t)data_ptr) & 0xffffffff);

   struct GENX(MEDIA_CURBE_LOAD) mdl = {
      GENX(MEDIA_CURBE_LOAD_header),
      .CURBETotalDataLength = descriptor->gfx9.cross_thread_push_size +
                              descriptor->gfx9.n_threads *
                              descriptor->gfx9.per_thread_push_size,
      .CURBEDataStartAddress = dyn_push_data_offset,
   };
   GENX(MEDIA_CURBE_LOAD_pack)(cmd_ptr, &mdl);
   cmd_ptr += GENX(MEDIA_CURBE_LOAD_length) * 4;

   /* Emit the walker.
    *
    * NOTE(review): the thread count comes from descriptor->threads here
    * while the CURBE size and the per-thread loop below use
    * descriptor->gfx9.n_threads - confirm both hold the same value.
    */
   struct GENX(GPGPU_WALKER) walker = {
      .PredicateEnable = (flags & ANV_GENERATED_FLAG_PREDICATED) != 0,
      .SIMDSize = descriptor->simd_size / 16,
      .ThreadWidthCounterMaximum = descriptor->threads - 1,
      .RightExecutionMask = descriptor->right_mask,
      .BottomExecutionMask = 0xffffffff,
      .ThreadGroupIDXDimension = info->x,
      .ThreadGroupIDYDimension = info->y,
      .ThreadGroupIDZDimension = info->z,
   };
   GENX(GPGPU_WALKER_repack)(cmd_ptr, descriptor->gfx9.gpgpu_walker, &walker);
   cmd_ptr += GENX(GPGPU_WALKER_length) * 4;

   /* The per-thread block of thread 0 is already in place right after the
    * cross-thread data: replicate it for the other threads and give each of
    * them its own index.
    */
   uint32_t per_thread_push_size = descriptor->gfx9.per_thread_push_size;
   if (per_thread_push_size > 0) {
      uint32_t cross_thread_push_size = descriptor->gfx9.cross_thread_push_size;
      global void *per_thread_ptr0 = data_ptr + cross_thread_push_size;
      global void *per_thread_ptr = per_thread_ptr0;

      for (uint32_t t = 0; t < descriptor->gfx9.n_threads; t++) {
         if (t > 0) {
            genX(copy_data)(per_thread_ptr, per_thread_ptr0,
                            per_thread_push_size);
         }
         *(global uint32_t *)(per_thread_ptr +
                              descriptor->gfx9.subgroup_id_offset) = t;
         per_thread_ptr += per_thread_push_size;
      }
   }

   struct GENX(MEDIA_STATE_FLUSH) flush = {
      GENX(MEDIA_STATE_FLUSH_header),
   };
   GENX(MEDIA_STATE_FLUSH_pack)(cmd_ptr, &flush);
}
#endif
/* Generate the push constant data and the commands of one compute sequence
 * (seq_idx).
 *
 * cmd_base/cmd_stride/cmd_prolog_size:    generated command buffer layout.
 * data_base/data_stride/data_prolog_size: generated data buffer layout.
 * seq_base/seq_stride:                    application indirect stream.
 * seq_count:      optional (may be 0) pointer to the sequence count, clamped
 *                 to max_seq_count.
 * layout:         what to read from the stream and where to write it.
 * indirect_set:   a single descriptor, or an array of them indexed by a
 *                 value read from the stream when
 *                 layout->indirect_set.active is set.
 * interface_descriptor_data_ptr: forwarded to emit_dispatch_commands().
 * const_ptr/const_size: CPU prepared application push constants.
 * driver_const_ptr:     CPU prepared driver push constants.
 * return_addr:    where the epilog jumps back to.
 * flags:          ANV_GENERATED_FLAG_* bits.
 *
 * Invocation 0 additionally writes the prolog/epilog, even when there is no
 * sequence to process.
 */
void
genX(libanv_preprocess_cs_generate)(global void *cmd_base,
                                    uint32_t cmd_stride,
                                    global void *data_base,
                                    uint32_t data_stride,
                                    global void *seq_base,
                                    uint32_t seq_stride,
                                    global uint32_t *seq_count,
                                    uint32_t max_seq_count,
                                    uint32_t cmd_prolog_size,
                                    uint32_t data_prolog_size,
                                    global struct anv_dgc_cs_layout *layout,
                                    global struct anv_dgc_cs_descriptor *indirect_set,
                                    global void *interface_descriptor_data_ptr,
                                    global void *const_ptr,
                                    uint32_t const_size,
                                    global void *driver_const_ptr,
                                    uint64_t return_addr,
                                    uint32_t flags,
                                    uint32_t seq_idx)
{
   uint32_t max_count = seq_count != 0 ? min(*seq_count, max_seq_count) : max_seq_count;

   if (seq_idx == 0) {
      write_prolog_epilog(cmd_base, cmd_stride, max_count,
                          cmd_prolog_size, seq_idx, return_addr);
   }

   if (seq_idx >= max_count)
      return;

   /* Pointer to the application generated data, laid out as described in
    * stream_layout.
    */
   global void *seq_ptr = seq_base + seq_idx * seq_stride;

   /* Get the shader descriptor. */
   global struct anv_dgc_cs_descriptor *descriptor;
   if (layout->indirect_set.active != 0) {
      uint32_t set_idx = *(global uint32_t *)(seq_ptr + layout->indirect_set.seq_offset);
      descriptor = &indirect_set[set_idx];
   } else {
      descriptor = indirect_set;
   }

   /* Prepare the push constant data: application data first (identical on
    * all generations), then the driver data.
    */
   global void *push_data_ptr =
      get_ptr(data_base, data_stride, data_prolog_size, seq_idx) +
      layout->push_constants.data_offset;

   write_app_push_constant_data(
      push_data_ptr, &layout->push_constants,
      seq_ptr, const_ptr, const_size, seq_idx);

#if GFX_VERx10 >= 125
   write_cs_drv_push_constant_data(
      push_data_ptr, driver_const_ptr,
      MAX_PUSH_CONSTANTS_SIZE,
      ANV_DRIVER_PUSH_CONSTANTS_SIZE,
      seq_ptr + layout->dispatch.seq_offset);
#else
   /* Here the generated data starts at the descriptor's push_data_offset:
    * rebase the destination accordingly and only copy the part of the
    * driver data located at or after that offset.
    */
   write_cs_drv_push_constant_data(
      push_data_ptr - descriptor->push_data_offset, driver_const_ptr,
      MAX2(descriptor->push_data_offset, MAX_PUSH_CONSTANTS_SIZE),
      MIN2(ANV_DRIVER_PUSH_CONSTANTS_SIZE,
           (MAX_PUSH_CONSTANTS_SIZE + ANV_DRIVER_PUSH_CONSTANTS_SIZE) -
           descriptor->push_data_offset),
      seq_ptr + layout->dispatch.seq_offset);
#endif

   /* Finally write the commands */
   emit_dispatch_commands(cmd_base, cmd_stride, seq_idx, cmd_prolog_size,
                          push_data_ptr, layout, descriptor,
                          interface_descriptor_data_ptr, flags,
                          seq_ptr + layout->dispatch.seq_offset);
}
/* Post-process one compute sequence (seq_idx): OR 2 dwords of the driver's
 * INTERFACE_DESCRIPTOR_DATA into the already generated one.
 *
 * On Gfx12.5+ the generated descriptor is embedded in the COMPUTE_WALKER of
 * the sequence's command slot and the driver dwords come from
 * descriptor->gfx125.compute_walker. On earlier generations it lives in the
 * data buffer at data_idd_offset and the driver dwords come from
 * descriptor->gfx9.interface_descriptor_data.
 *
 * Invocation 0 additionally writes the prolog/epilog, even when there is no
 * sequence to process.
 */
void
genX(libanv_postprocess_cs_generate)(global void *cmd_base,
                                     uint32_t cmd_stride,
                                     global void *data_base,
                                     uint32_t data_stride,
                                     global uint32_t *seq_count,
                                     uint32_t max_seq_count,
                                     uint32_t cmd_prolog_size,
                                     uint32_t data_prolog_size,
                                     uint32_t data_idd_offset,
                                     global struct anv_dgc_cs_descriptor *descriptor,
                                     uint64_t return_addr,
                                     uint32_t seq_idx)
{
   uint32_t max_count = max_seq_count;
   if (seq_count != 0)
      max_count = min(*seq_count, max_seq_count);

   if (seq_idx == 0) {
      write_prolog_epilog(cmd_base, cmd_stride, max_count,
                          cmd_prolog_size, seq_idx, return_addr);
   }

   if (seq_idx >= max_count)
      return;

   /* OR the driver INTERFACE_DESCRIPTOR_DATA dwords with the device generated
    * ones.
    */
   uint32_t n_dwords = 2; /* dwords covered from
                           * INTERFACE_DESCRIPTOR_DATA::SamplerCount to
                           * INTERFACE_DESCRIPTOR_DATA::BindingTablePointer
                           */

#if GFX_VERx10 >= 125
   /* Where the commands of this sequence were written */
   global void *cmd_ptr =
      get_ptr(cmd_base, cmd_stride, cmd_prolog_size, seq_idx);

   uint32_t csw_offset_B = 4 /* offset in COMPUTE_WALKER */;
   uint32_t csw_body_offset_B = (GFX_VERx10 >= 200 ? 72 : 68) /* offset in COMPUTE_WALKER_BODY */;
   uint32_t idd_offset_B = 12 /* offset in INTERFACE_DESCRIPTOR_DATA */;
   uint32_t inst_offset_B = csw_offset_B + csw_body_offset_B + idd_offset_B;
   global void *generated_ptr = cmd_ptr + inst_offset_B;

   merge_dwords(generated_ptr,
                generated_ptr,
                &descriptor->gfx125.compute_walker[inst_offset_B / 4],
                n_dwords);
#else
   uint32_t inst_offset_B = 12 /* offset in INTERFACE_DESCRIPTOR_DATA */;
   global void *idd_ptr =
      get_ptr(data_base, data_stride, data_prolog_size, seq_idx) +
      data_idd_offset;
   global void *generated_ptr = idd_ptr + inst_offset_B;

   merge_dwords(generated_ptr,
                generated_ptr,
                &descriptor->gfx9.interface_descriptor_data[inst_offset_B / 4],
                n_dwords);
#endif
}
#if GFX_VERx10 >= 125
/* Compute per-axis shifts (log2 of a local size) for a trace of global_size.
 *
 * Shifts are handed out one at a time, round-robin over x, y, z, to every
 * axis whose current local size (1 << shift) is still smaller than the
 * global size, until 3 shifts have been distributed. If no axis can grow
 * anymore before that, the remaining shifts go to x. The returned shifts
 * therefore always add up to 3.
 */
static uint3
calc_local_trace_size(uint3 global_size)
{
   uint3 shift = (uint3)(0, 0, 0);
   unsigned shifts_used = 0;

   for (;;) {
      bool grew = false;

      for (unsigned axis = 0; axis < 3; axis++) {
         if ((1 << shift[axis]) < global_size[axis]) {
            shift[axis]++;
            shifts_used++;
            grew = true;
         }

         if (shifts_used == 3)
            return shift;
      }

      if (!grew)
         break;
   }

   /* Assign whatever's left to x */
   shift[0] += 3 - shifts_used;

   return shift;
}
/* Generate the data and the COMPUTE_WALKER of one ray tracing sequence
 * (seq_idx).
 *
 * cmd_base/cmd_stride/cmd_prolog_size:    generated command buffer layout.
 * data_base/data_stride/data_prolog_size: generated data buffer layout.
 * seq_base/seq_stride:                    application indirect stream.
 * seq_count:      optional (may be 0) pointer to the sequence count, clamped
 *                 to max_seq_count.
 * layout:         what to read from the stream and where to write it.
 * compute_walker_template: prepacked COMPUTE_WALKER the generated fields are
 *                          repacked with.
 * rtdg_global_template:    prepacked RT_DISPATCH_GLOBALS the generated
 *                          fields are repacked with.
 * const_ptr/const_size: CPU prepared application push constants.
 * driver_const_ptr:     CPU prepared driver push constants.
 * return_addr:    where the epilog jumps back to.
 * flags:          ANV_GENERATED_FLAG_* bits.
 *
 * Per-sequence data layout, starting at layout->push_constants.data_offset:
 * RT_DISPATCH_GLOBALS, then the application push constants at
 * ANV_DGC_RT_GLOBAL_DISPATCH_SIZE, then the driver push constants
 * MAX_PUSH_CONSTANTS_SIZE further.
 *
 * Invocation 0 additionally writes the prolog/epilog, even when there is no
 * sequence to process.
 */
void
genX(libanv_preprocess_rt_generate)(global void *cmd_base,
                                    uint32_t cmd_stride,
                                    global void *data_base,
                                    uint32_t data_stride,
                                    global void *seq_base,
                                    uint32_t seq_stride,
                                    global uint32_t *seq_count,
                                    uint32_t max_seq_count,
                                    uint32_t cmd_prolog_size,
                                    uint32_t data_prolog_size,
                                    global struct anv_dgc_cs_layout *layout,
                                    global void *compute_walker_template,
                                    global void *rtdg_global_template,
                                    global void *const_ptr,
                                    uint32_t const_size,
                                    global void *driver_const_ptr,
                                    uint64_t return_addr,
                                    uint32_t flags,
                                    uint32_t seq_idx)
{
   uint32_t max_count = seq_count != 0 ? min(*seq_count, max_seq_count) : max_seq_count;

   if (seq_idx == 0) {
      write_prolog_epilog(cmd_base, cmd_stride, max_count,
                          cmd_prolog_size, seq_idx, return_addr);
   }

   if (seq_idx >= max_count)
      return;

   /* Where to write the commands */
   global void *cmd_ptr =
      get_ptr(cmd_base, cmd_stride, cmd_prolog_size, seq_idx);

   /* Pointer to the application generated data, laid out as described in
    * stream_layout.
    */
   global void *seq_ptr = seq_base + seq_idx * seq_stride;

   /* The trace parameters live in the indirect stream, keep the pointer in
    * the global address space.
    */
   global VkTraceRaysIndirectCommand2KHR *info =
      ((global VkTraceRaysIndirectCommand2KHR *)(seq_ptr + layout->dispatch.seq_offset));
   uint3 launch_size = (uint3)(info->width, info->height, info->depth);

   /* RTDG + push constants */
   global void *push_data_ptr =
      get_ptr(data_base, data_stride, data_prolog_size, seq_idx) +
      layout->push_constants.data_offset;
   global void *rtdg_ptr = push_data_ptr;

   struct GENX(RT_DISPATCH_GLOBALS) rtdg = {
      .LaunchWidth = launch_size.x,
      .LaunchHeight = launch_size.y,
      .LaunchDepth = launch_size.z,
#if GFX_VER >= 30
      .HitGroupStride = info->hitShaderBindingTableStride,
      .HitGroupTable = info->hitShaderBindingTableAddress,
      .MissGroupTable = info->missShaderBindingTableAddress,
      .MissGroupStride = info->missShaderBindingTableStride,
      .CallableGroupTable = info->callableShaderBindingTableAddress,
      .CallableGroupStride = info->callableShaderBindingTableStride,
#else
      .HitGroupTable = (struct GENX(RT_SHADER_TABLE)) {
         .BaseAddress = info->hitShaderBindingTableAddress,
         .Stride = info->hitShaderBindingTableStride,
      },
      .MissGroupTable = (struct GENX(RT_SHADER_TABLE)) {
         .BaseAddress = info->missShaderBindingTableAddress,
         .Stride = info->missShaderBindingTableStride,
      },
      .CallableGroupTable = (struct GENX(RT_SHADER_TABLE)) {
         .BaseAddress = info->callableShaderBindingTableAddress,
         .Stride = info->callableShaderBindingTableStride,
      },
#endif
   };
   GENX(RT_DISPATCH_GLOBALS_repack)(rtdg_ptr, rtdg_global_template, &rtdg);

   write_app_push_constant_data(
      push_data_ptr + ANV_DGC_RT_GLOBAL_DISPATCH_SIZE,
      &layout->push_constants,
      seq_ptr, const_ptr, const_size, seq_idx);
   write_rt_drv_push_constant_data(
      push_data_ptr +
      ANV_DGC_RT_GLOBAL_DISPATCH_SIZE +
      MAX_PUSH_CONSTANTS_SIZE,
      driver_const_ptr,
      ANV_DRIVER_PUSH_CONSTANTS_SIZE);

   /* Split the launch into workgroups: local size is a power of two on each
    * axis, the number of workgroups is the launch size rounded up.
    */
   uint3 local_size_log2 = calc_local_trace_size(launch_size);
   uint3 one = 1;
   uint3 local_size = one << local_size_log2;
   uint3 global_size = DIV_ROUND_UP(launch_size, local_size);

   /* Finally write the commands */
   struct GENX(COMPUTE_WALKER) v = {
      .PredicateEnable = (flags & ANV_GENERATED_FLAG_PREDICATED) != 0,
      .body = {
         .LocalXMaximum = (1u << local_size_log2.x) - 1,
         .LocalYMaximum = (1u << local_size_log2.y) - 1,
         .LocalZMaximum = (1u << local_size_log2.z) - 1,
         .ThreadGroupIDXDimension = global_size.x,
         .ThreadGroupIDYDimension = global_size.y,
         .ThreadGroupIDZDimension = global_size.z,
         /* See struct brw_rt_raygen_trampoline_params */
         .InlineData = {
            ((uint64_t) rtdg_ptr) & 0xffffffff,
            ((uint64_t) rtdg_ptr) >> 32,
            info->raygenShaderRecordAddress & 0xffffffff,
            info->raygenShaderRecordAddress >> 32,
            local_size_log2.x << 8 |
            local_size_log2.y << 16 |
            local_size_log2.z << 24,
         },
      },
   };
   GENX(COMPUTE_WALKER_repack)(cmd_ptr, compute_walker_template, &v);
}
#endif /* GFX_VERx10 >= 125 */
#endif /* GFX_VER >= 11 */

View file

@ -4,6 +4,31 @@
#include "libintel_shaders.h"
/* Pack, at dst_ptr, an MI_STORE_DATA_IMM with `address` as destination and
 * the 64-bit `value` as immediate data.
 */
void genX(write_address)(global void *dst_ptr, global void *address, uint64_t value)
{
struct GENX(MI_STORE_DATA_IMM) v = {
GENX(MI_STORE_DATA_IMM_header),
/* One dword more than the default length.
 * NOTE(review): presumably for the upper half of the 64-bit immediate -
 * confirm against the genxml definition.
 */
.DWordLength = GENX(MI_STORE_DATA_IMM_length) -
GENX(MI_STORE_DATA_IMM_length_bias) + 1,
#if GFX_VER >= 12
.ForceWriteCompletionCheck = true,
#endif
.Address = (uint64_t)address,
.ImmediateData = value,
};
GENX(MI_STORE_DATA_IMM_pack)(dst_ptr, &v);
}
/* Pack, at dst_ptr, a 3DSTATE_VF_TOPOLOGY with PrimitiveTopologyType set to
 * `topology`.
 */
void genX(write_3DSTATE_VF_TOPOLOGY)(global void *dst_ptr,
uint32_t topology)
{
struct GENX(3DSTATE_VF_TOPOLOGY) v = {
GENX(3DSTATE_VF_TOPOLOGY_header),
.PrimitiveTopologyType = topology,
};
GENX(3DSTATE_VF_TOPOLOGY_pack)(dst_ptr, &v);
}
void genX(write_3DSTATE_VERTEX_BUFFERS)(global void *dst_ptr,
uint32_t buffer_count)
{
@ -38,6 +63,25 @@ void genX(write_VERTEX_BUFFER_STATE)(global void *dst_ptr,
GENX(VERTEX_BUFFER_STATE_pack)(dst_ptr, &v);
}
/**
 * Pack a 3DSTATE_INDEX_BUFFER instruction at dst_ptr.
 *
 * buffer_addr, buffer_size, index_format and mocs are copied into the
 * matching fields of the instruction. On GFX_VER >= 12, L3BypassDisable is
 * always set.
 */
void genX(write_3DSTATE_INDEX_BUFFER)(global void *dst_ptr,
                                      uint64_t buffer_addr,
                                      uint32_t buffer_size,
                                      uint32_t index_format,
                                      uint32_t mocs)
{
   struct GENX(3DSTATE_INDEX_BUFFER) ib = { GENX(3DSTATE_INDEX_BUFFER_header) };

   ib.MOCS = mocs;
   ib.IndexFormat = index_format;
#if GFX_VER >= 12
   ib.L3BypassDisable = true;
#endif
   ib.BufferStartingAddress = buffer_addr;
   ib.BufferSize = buffer_size;

   GENX(3DSTATE_INDEX_BUFFER_pack)(dst_ptr, &ib);
}
void genX(write_3DPRIMITIVE)(global void *dst_ptr,
bool is_predicated,
bool is_indexed,
@ -202,3 +246,25 @@ void genX(write_draw)(global uint32_t *dst_ptr,
}
#endif
}
#if GFX_VERx10 >= 125
/**
 * Pack a 3DMESH_3D instruction at dst_ptr.
 *
 * The thread group counts are read from the
 * VkDrawMeshTasksIndirectCommandEXT located at indirect_ptr.
 * is_predicated and uses_tbimr are forwarded to the PredicateEnable and
 * TBIMREnabled fields respectively.
 */
void genX(write_3DMESH_3D)(global uint32_t *dst_ptr,
                           global void *indirect_ptr,
                           bool is_predicated,
                           bool uses_tbimr)
{
   global const VkDrawMeshTasksIndirectCommandEXT *counts =
      (global const VkDrawMeshTasksIndirectCommandEXT *)indirect_ptr;

   struct GENX(3DMESH_3D) mesh = { GENX(3DMESH_3D_header) };

   mesh.TBIMREnabled = uses_tbimr;
   mesh.PredicateEnable = is_predicated;
   mesh.ThreadGroupCountX = counts->groupCountX;
   mesh.ThreadGroupCountY = counts->groupCountY;
   mesh.ThreadGroupCountZ = counts->groupCountZ;

   GENX(3DMESH_3D_pack)(dst_ptr, &mesh);
}
#endif

View file

@ -13,17 +13,22 @@
#include "util/macros.h"
#else
#include "compiler/intel_shader_enums.h"
#define _MESA_LIBCL_ASSERT_IGNORE 1
#else
#include "libcl_vk.h"
#include "genxml/gen_macros.h"
#include "genxml/genX_cl_pack.h"
#include "genxml/genX_rt_cl_pack.h"
#define PRAGMA_POISON(param)
#include "compiler/intel_shader_enums.h"
#define _3DPRIM_PATCHLIST(n) (0x20 + (n - 1))
#endif
#define ANV_GENERATED_MAX_VES (29)
/**
* Flags for generate_draws.cl
*/
@ -46,6 +51,10 @@ enum anv_generated_draw_flags {
ANV_GENERATED_FLAG_WA_16011107343 = BITFIELD_BIT(7),
/* Wa_22018402687 */
ANV_GENERATED_FLAG_WA_22018402687 = BITFIELD_BIT(8),
/* Wa_16014912113 */
ANV_GENERATED_FLAG_WA_16014912113 = BITFIELD_BIT(9),
/* Wa_18022330953 / Wa_22011440098 */
ANV_GENERATED_FLAG_WA_18022330953 = BITFIELD_BIT(10)
};
/**
@ -58,6 +67,9 @@ enum anv_generated_draw_flags {
#ifdef __OPENCL_VERSION__
/* Packs an MI_STORE_DATA_IMM at dst_ptr that writes the immediate value to
* address.
*/
void genX(write_address)(global void *dst_ptr,
global void *address, uint64_t value);
/* NOTE(review): presumably writes the 3DSTATE_VERTEX_BUFFERS header sized for
* buffer_count entries at dst_ptr - confirm against the definition.
*/
void genX(write_3DSTATE_VERTEX_BUFFERS)(global void *dst_ptr,
uint32_t buffer_count);
@ -68,6 +80,15 @@ void genX(write_VERTEX_BUFFER_STATE)(global void *dst_ptr,
uint32_t size,
uint32_t stride);
/* Packs a 3DSTATE_INDEX_BUFFER at dst_ptr from the given buffer address,
* size, index format and MOCS.
*/
void genX(write_3DSTATE_INDEX_BUFFER)(global void *dst_ptr,
uint64_t buffer_addr,
uint32_t buffer_size,
uint32_t index_format,
uint32_t mocs);
/* Packs a 3DSTATE_VF_TOPOLOGY at dst_ptr with topology as its
* PrimitiveTopologyType.
*/
void genX(write_3DSTATE_VF_TOPOLOGY)(global void *dst_ptr,
uint32_t topology);
void genX(write_3DPRIMITIVE)(global void *dst_ptr,
bool is_predicated,
bool is_indexed,
@ -93,6 +114,13 @@ void genX(write_3DPRIMITIVE_EXTENDED)(global void *dst_ptr,
uint32_t param_draw_id);
#endif
#if GFX_VERx10 >= 125
/* Packs a 3DMESH_3D at dst_ptr. Thread group counts are read from the
* VkDrawMeshTasksIndirectCommandEXT located at indirect_ptr.
*/
void genX(write_3DMESH_3D)(global uint32_t *dst_ptr,
global void *indirect_ptr,
bool is_predicated,
bool uses_tbimr);
#endif
/* NOTE(review): presumably packs an MI_BATCH_BUFFER_START at dst_ptr that
* jumps to addr - confirm against the definition.
*/
void genX(write_MI_BATCH_BUFFER_START)(global void *dst_ptr, uint64_t addr);
void genX(write_draw)(global uint32_t *dst_ptr,
@ -112,6 +140,10 @@ void genX(copy_data)(global void *dst_ptr,
global void *src_ptr,
uint32_t size);
/* NOTE(review): presumably fills size bytes at dst_ptr with the 32-bit
* pattern data (counterpart of copy_data) - confirm against the definition.
*/
void genX(set_data)(global void *dst_ptr,
uint32_t data,
uint32_t size);
#endif /* __OPENCL_VERSION__ */
#endif /* _LIBANV_SHADERS_H_ */

View file

@ -20,6 +20,7 @@ endif
intel_shader_files = files(
'libintel_shaders.h',
'dgc.cl',
'generate.cl',
'generate_draws.cl',
'generate_draws_iris.cl',
@ -45,6 +46,7 @@ foreach gen : intel_shaders_gens
command : [
prog_mesa_clc,
intel_shader_files, '-o', '@OUTPUT@', '--depfile', '@DEPFILE@', '--',
'-Wno-initializer-overrides',
'-DGFX_VERx10=@0@'.format(gen[0]),
'-I' + join_paths(meson.current_source_dir(), '.'),
'-I' + join_paths(dir_source_root, 'src/compiler/libcl'),

View file

@ -194,5 +194,267 @@ struct anv_push_constants {
};
};
/* Size of the part of struct anv_push_constants that is not covered by
* MAX_PUSH_CONSTANTS_SIZE. NOTE(review): presumably the driver-internal data
* stored after the client visible push constants - confirm against
* struct anv_push_constants.
*/
#define ANV_DRIVER_PUSH_CONSTANTS_SIZE (sizeof(struct anv_push_constants) - MAX_PUSH_CONSTANTS_SIZE)
/* Reserved 8-bit values (255 and 254). NOTE(review): presumably used in
* inline_dwords[] entries to select the lower/upper dword of the push
* constant address rather than a push constant dword - confirm.
*/
#define ANV_INLINE_DWORD_PUSH_ADDRESS_LDW (UINT8_MAX - 0)
#define ANV_INLINE_DWORD_PUSH_ADDRESS_UDW (UINT8_MAX - 1)
/* Location of the user visible part of the dynamic state heap (1GiB) */
#define ANV_DYNAMIC_VISIBLE_HEAP_OFFSET (1024 * 1024 * 1024)
/**
* Stage enum for generated commands
*
* Values are consecutive starting at 0; ANV_DGC_STAGES is the total number
* of stages.
*/
enum anv_dgc_stage {
ANV_DGC_STAGE_VERTEX = 0,
ANV_DGC_STAGE_TESS_CTRL,
ANV_DGC_STAGE_TESS_EVAL,
ANV_DGC_STAGE_GEOMETRY,
ANV_DGC_STAGE_FRAGMENT,
ANV_DGC_STAGE_TASK,
ANV_DGC_STAGE_MESH,
ANV_DGC_STAGE_COMPUTE,
ANV_DGC_STAGE_RT,
ANV_DGC_STAGES,
};
/* Number of graphics stages: every stage up to and including
* ANV_DGC_STAGE_MESH (compute and RT are excluded).
*/
#define ANV_DGC_N_GFX_STAGES (ANV_DGC_STAGE_MESH + 1)
/**
* Kind of draw command generated for a sequence. NOTE(review): presumably the
* value stored in anv_dgc_gfx_layout::draw::draw_type - confirm.
*/
enum anv_dgc_draw_type {
ANV_DGC_DRAW_TYPE_SEQUENTIAL,
ANV_DGC_DRAW_TYPE_INDEXED,
ANV_DGC_DRAW_TYPE_MESH,
};
/* Size in bytes reserved at the start of the ray tracing push data, ahead of
* the push constants. NOTE(review): presumably holds RT_DISPATCH_GLOBALS -
* confirm.
*/
#define ANV_DGC_RT_GLOBAL_DISPATCH_SIZE (128)
/**
* Flags stored in anv_dgc_push_layout::flags.
*/
enum anv_dgc_push_constant_flags {
/* NOTE(review): presumably set when push constant commands must be emitted
* in the preprocess buffer - confirm against users.
*/
ANV_DGC_PUSH_CONSTANTS_CMD_ACTIVE = BITFIELD_BIT(0),
};
/**
* This structure represents the indirect data layout (in
* VkGeneratedCommandsInfoEXT::indirectAddress) for push constants
*
* Field order and sizes define a memory layout, do not reorder.
*/
struct anv_dgc_push_layout {
/* Copy operations from the indirect buffer into the push constant data */
struct anv_dgc_push_entry {
/* Location of the data to copy in the indirect buffer */
uint32_t seq_offset;
/* Location where to write the data in anv_push_constants::client_data[]
*/
uint16_t push_offset;
/* Size of the data to copy */
uint16_t size;
} entries[32];
uint8_t flags; /* enum anv_dgc_push_constant_flags */
/* NOTE(review): presumably the number of valid elements in entries[] */
uint8_t num_entries;
uint8_t mocs;
/* Whether the sequence ID is active and at what offset we should write it
* in the push constant data
*/
uint16_t seq_id_active;
uint16_t seq_id_offset;
/* Offset of the push constant commands in the preprocessed buffer.
*/
uint16_t cmd_offset;
uint16_t cmd_size;
/* Offset of the data in the indirect buffer, relative to
* VkGeneratedCommandsInfoEXT::indirectAddress
*/
uint16_t data_offset;
};
/**
* This structure represents both the data layout (in
* VkGeneratedCommandsInfoEXT::indirectAddress) and the command layout in the
* preprocess buffer (in VkGeneratedCommandsInfoEXT::preprocessAddress) for
* graphics commands
*
* Field order and sizes define a memory layout, do not reorder.
*/
struct anv_dgc_gfx_layout {
/* Index buffer binding */
struct anv_dgc_index_buffer {
uint16_t cmd_offset; /* Offset of 3DSTATE_INDEX_BUFFER */
uint16_t cmd_size;
uint16_t seq_offset; /* Offset of VkBindIndexBufferIndirectCommandEXT */
uint16_t mocs;
/* NOTE(review): presumably the index type values found in the indirect
* data that identify 32/16/8-bit indices - confirm against users.
*/
uint32_t u32_value;
uint32_t u16_value;
uint32_t u8_value;
} index_buffer;
/* Vertex buffer bindings */
struct {
struct anv_dgc_vertex_buffer {
uint16_t seq_offset; /* Offset of VkBindVertexBufferIndirectCommandEXT */
uint16_t binding;
} buffers[31];
uint16_t n_buffers;
uint16_t mocs;
uint16_t cmd_offset; /* Offset of 3DSTATE_VERTEX_BUFFERS */
uint16_t cmd_size;
} vertex_buffers;
struct anv_dgc_push_layout push_constants;
struct {
uint16_t final_cmds_offset;
uint16_t final_cmds_size;
uint32_t active;
} indirect_set;
/* Draw command */
struct {
uint16_t cmd_offset; /* Offset of 3DPRIMITIVE/3DMESH_3D */
uint16_t cmd_size;
uint16_t draw_type; /* enum anv_dgc_draw_type */
uint16_t seq_offset; /* Offset of :
* - VkDrawIndirectCommand
* - VkDrawIndexedIndirectCommand
* - VkDrawMeshTasksIndirectCommandEXT
*/
} draw;
};
/**
* This structure represents both the data layout (in
* VkGeneratedCommandsInfoEXT::indirectAddress) and the command layout in the
* preprocess buffer (in VkGeneratedCommandsInfoEXT::preprocessAddress) for
* compute commands
*
* Field order and sizes define a memory layout, do not reorder.
*/
struct anv_dgc_cs_layout {
struct anv_dgc_push_layout push_constants;
/* Location of the indirect execution set index */
struct {
uint32_t seq_offset;
uint16_t data_offset;
uint16_t active;
} indirect_set;
/* Offset of VkDispatchIndirectCommand */
struct {
uint32_t seq_offset;
uint16_t cmd_offset;
/* Explicit padding, keeps the struct size a multiple of 4 bytes */
uint16_t pad;
} dispatch;
};
/**
* Type of a push slot, stored in anv_dgc_push_stage_slot::type.
*/
enum anv_dgc_push_slot_type {
ANV_DGC_PUSH_SLOT_TYPE_PUSH_CONSTANTS,
ANV_DGC_PUSH_SLOT_TYPE_OTHER,
};
/**
* This structure holds prepacked HW instructions for a set of graphics
* shaders forming a pipeline. It is part of the command buffer temporary
* memory.
*/
struct anv_dgc_gfx_descriptor {
/* Fully packed instructions ready to be copied directly into the
* preprocess buffer (for workarounds)
*/
uint32_t final_commands[20];
uint32_t final_commands_size;
uint32_t wa_18019110168_remapping_table_offset;
/* Per stage push constant programming; the union holds either the legacy
* slot description or the bindless one.
*/
struct {
struct anv_dgc_push_stage_state {
union {
struct {
struct anv_dgc_push_stage_slot {
uint16_t push_data_offset;
uint16_t push_data_size;
uint32_t type; /* enum anv_dgc_push_slot_type */
} slots[4];
uint32_t n_slots;
} legacy;
struct anv_dgc_push_bindless_stage {
uint16_t push_data_offset;
uint16_t inline_dwords_count;
uint8_t inline_dwords[8];
} bindless;
};
} stages[ANV_DGC_N_GFX_STAGES];
uint32_t active_stages; /* Bitfield of enum anv_dgc_stage */
} push_constants;
};
/**
* This structure holds information about the graphics state for generation.
*
* It bundles the indirect/preprocess layout and the prepacked descriptor
* with additional push constant and draw parameters.
*/
struct anv_dgc_gfx_state {
struct anv_dgc_gfx_layout layout;
struct anv_dgc_gfx_descriptor descriptor;
struct {
/* NOTE(review): presumably one address per legacy push slot (4 slots) -
* confirm against users.
*/
uint64_t addresses[4];
} push_constants;
struct {
uint16_t instance_multiplier;
uint32_t flags; /* ANV_GENERATED_FLAG_* */
} draw;
};
/**
* This structure holds prepacked HW instructions for a compute shader. It is
* either located in the memory associated with VkIndirectExecutionSetEXT or
* part of the command buffer temporary memory if indirect execution set is
* not used.
*
* The union holds the data for one of two hardware paths (gfx125 or gfx9).
*/
struct anv_dgc_cs_descriptor {
union {
struct {
uint32_t compute_walker[40];
uint32_t inline_dwords_count;
uint8_t inline_dwords[8];
} gfx125;
struct {
/* Needs to be the first field because
* MEDIA_INTERFACE_DESCRIPTOR_LOAD::InterfaceDescriptorDataStartAddress
* needs 64B alignment.
*/
uint32_t interface_descriptor_data[8];
uint32_t gpgpu_walker[15];
uint32_t media_vfe_state[9];
uint32_t n_threads;
uint16_t cross_thread_push_size;
uint8_t per_thread_push_size;
uint8_t subgroup_id_offset;
} gfx9;
};
uint32_t right_mask;
uint32_t threads;
uint32_t simd_size;
uint32_t push_data_offset;
/* Align the struct to 64B (172B union + 4 dwords + 1 pad dword = 192B with
* natural alignment)
*/
uint32_t pad[1];
};
/**
* This structure holds information for a ray tracing pipeline.
*
* NOTE(review): call_handler and the *_sbt fields are 64-bit values,
* presumably GPU addresses of the call stack handler and of the hit, miss
* and callable shader binding tables - confirm against users.
*/
struct anv_dgc_rt_indirect_descriptor {
uint32_t ray_stack_stride;
uint32_t stack_ids_per_dss;
uint32_t sw_stack_size;
uint64_t call_handler;
uint64_t hit_sbt;
uint64_t miss_sbt;
uint64_t callable_sbt;
};