mesa/src/amd/vulkan/radv_device_generated_commands.c
Samuel Pitoiset 4586451b2d radv: add missing SQTT markers when an indirect indexed draw is used with DGC
Since DGC preprocessing for IBO is supported, the driver generates
an indexed indirect draw but SQTT markers were missing and this
introduced complete non-sense in RGP captures.

Fixes: e59a16bbb8 ("radv: use an indirect draw when IBO isn't updated as part of DGC")
Signed-off-by: Samuel Pitoiset <samuel.pitoiset@gmail.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/28710>
2024-04-15 07:22:49 +00:00

2303 lines
94 KiB
C

/*
* Copyright © 2021 Google
*
* SPDX-License-Identifier: MIT
*/
#include "radv_device_generated_commands.h"
#include "meta/radv_meta.h"
#include "radv_entrypoints.h"
#include "ac_rgp.h"
#include "nir_builder.h"
#include "vk_common_entrypoints.h"
#include "vk_shader_module.h"
static void
radv_get_sequence_size_compute(const struct radv_indirect_command_layout *layout,
const struct radv_compute_pipeline *pipeline, uint32_t *cmd_size)
{
const struct radv_device *device = container_of(layout->base.device, struct radv_device, vk);
const struct radv_physical_device *pdev = radv_device_physical(device);
/* dispatch */
*cmd_size += 5 * 4;
if (pipeline) {
struct radv_shader *cs = radv_get_shader(pipeline->base.shaders, MESA_SHADER_COMPUTE);
const struct radv_userdata_info *loc = radv_get_user_sgpr(cs, AC_UD_CS_GRID_SIZE);
if (loc->sgpr_idx != -1) {
if (device->load_grid_size_from_user_sgpr) {
/* PKT3_SET_SH_REG for immediate values */
*cmd_size += 5 * 4;
} else {
/* PKT3_SET_SH_REG for pointer */
*cmd_size += 4 * 4;
}
}
} else {
/* COMPUTE_PGM_{LO,RSRC1,RSRC2} */
*cmd_size += 7 * 4;
if (pdev->info.gfx_level >= GFX10) {
/* COMPUTE_PGM_RSRC3 */
*cmd_size += 3 * 4;
}
/* COMPUTE_{RESOURCE_LIMITS,NUM_THREADS_X} */
*cmd_size += 8 * 4;
/* Assume the compute shader needs grid size because we can't know the information for
* indirect pipelines.
*/
if (device->load_grid_size_from_user_sgpr) {
/* PKT3_SET_SH_REG for immediate values */
*cmd_size += 5 * 4;
} else {
/* PKT3_SET_SH_REG for pointer */
*cmd_size += 4 * 4;
}
}
if (device->sqtt.bo) {
/* sqtt markers */
*cmd_size += 8 * 3 * 4;
}
}
static void
radv_get_sequence_size_graphics(const struct radv_indirect_command_layout *layout,
const struct radv_graphics_pipeline *pipeline, uint32_t *cmd_size,
uint32_t *upload_size)
{
const struct radv_device *device = container_of(layout->base.device, struct radv_device, vk);
const struct radv_physical_device *pdev = radv_device_physical(device);
const struct radv_shader *vs = radv_get_shader(pipeline->base.shaders, MESA_SHADER_VERTEX);
if (layout->bind_vbo_mask) {
*upload_size += 16 * util_bitcount(vs->info.vs.vb_desc_usage_mask);
/* One PKT3_SET_SH_REG for emitting VBO pointer (32-bit) */
*cmd_size += 3 * 4;
}
if (layout->binds_index_buffer) {
/* Index type write (normal reg write) + index buffer base write (64-bits, but special packet
* so only 1 word overhead) + index buffer size (again, special packet so only 1 word
* overhead)
*/
*cmd_size += (3 + 3 + 2) * 4;
}
if (layout->indexed) {
if (layout->binds_index_buffer) {
/* userdata writes + instance count + indexed draw */
*cmd_size += (5 + 2 + 5) * 4;
} else {
/* PKT3_SET_BASE + PKT3_DRAW_{INDEX}_INDIRECT_MULTI */
*cmd_size += (4 + (pipeline->uses_drawid ? 10 : 5)) * 4;
}
} else {
if (layout->draw_mesh_tasks) {
/* userdata writes + instance count + non-indexed draw */
*cmd_size += (6 + 2 + (pdev->mesh_fast_launch_2 ? 5 : 3)) * 4;
} else {
/* userdata writes + instance count + non-indexed draw */
*cmd_size += (5 + 2 + 3) * 4;
}
}
if (device->sqtt.bo) {
/* sqtt markers */
*cmd_size += 5 * 3 * 4;
}
}
static void
radv_get_sequence_size(const struct radv_indirect_command_layout *layout, struct radv_pipeline *pipeline,
uint32_t *cmd_size, uint32_t *upload_size)
{
const struct radv_device *device = container_of(layout->base.device, struct radv_device, vk);
*cmd_size = 0;
*upload_size = 0;
if (layout->push_constant_mask) {
bool need_copy = false;
if (pipeline) {
for (unsigned i = 0; i < ARRAY_SIZE(pipeline->shaders); ++i) {
if (!pipeline->shaders[i])
continue;
struct radv_userdata_locations *locs = &pipeline->shaders[i]->info.user_sgprs_locs;
if (locs->shader_data[AC_UD_PUSH_CONSTANTS].sgpr_idx >= 0) {
/* One PKT3_SET_SH_REG for emitting push constants pointer (32-bit) */
*cmd_size += 3 * 4;
need_copy = true;
}
if (locs->shader_data[AC_UD_INLINE_PUSH_CONSTANTS].sgpr_idx >= 0)
/* One PKT3_SET_SH_REG writing all inline push constants. */
*cmd_size += (3 * util_bitcount64(layout->push_constant_mask)) * 4;
}
} else {
/* Assume the compute shader needs both user SGPRs because we can't know the information
* for indirect pipelines.
*/
assert(layout->pipeline_bind_point == VK_PIPELINE_BIND_POINT_COMPUTE);
*cmd_size += 3 * 4;
need_copy = true;
*cmd_size += (3 * util_bitcount64(layout->push_constant_mask)) * 4;
}
if (need_copy) {
*upload_size += align(layout->push_constant_size + 16 * layout->dynamic_offset_count, 16);
}
}
if (device->sqtt.bo) {
/* THREAD_TRACE_MARKER */
*cmd_size += 2 * 4;
}
if (layout->pipeline_bind_point == VK_PIPELINE_BIND_POINT_GRAPHICS) {
struct radv_graphics_pipeline *graphics_pipeline = radv_pipeline_to_graphics(pipeline);
radv_get_sequence_size_graphics(layout, graphics_pipeline, cmd_size, upload_size);
} else {
assert(layout->pipeline_bind_point == VK_PIPELINE_BIND_POINT_COMPUTE);
struct radv_compute_pipeline *compute_pipeline = pipeline ? radv_pipeline_to_compute(pipeline) : NULL;
radv_get_sequence_size_compute(layout, compute_pipeline, cmd_size);
}
}
static uint32_t
radv_align_cmdbuf_size(const struct radv_device *device, uint32_t size, enum amd_ip_type ip_type)
{
const struct radv_physical_device *pdev = radv_device_physical(device);
const uint32_t ib_alignment = pdev->info.ip[ip_type].ib_alignment;
return align(size, ib_alignment);
}
static unsigned
radv_dgc_preamble_cmdbuf_size(const struct radv_device *device)
{
return radv_align_cmdbuf_size(device, 16, AMD_IP_GFX);
}
static bool
radv_dgc_use_preamble(const VkGeneratedCommandsInfoNV *cmd_info)
{
/* Heuristic on when the overhead for the preamble (i.e. double jump) is worth it. Obviously
* a bit of a guess as it depends on the actual count which we don't know. */
return cmd_info->sequencesCountBuffer != VK_NULL_HANDLE && cmd_info->sequencesCount >= 64;
}
uint32_t
radv_get_indirect_cmdbuf_size(const VkGeneratedCommandsInfoNV *cmd_info)
{
VK_FROM_HANDLE(radv_indirect_command_layout, layout, cmd_info->indirectCommandsLayout);
VK_FROM_HANDLE(radv_pipeline, pipeline, cmd_info->pipeline);
const struct radv_device *device = container_of(layout->base.device, struct radv_device, vk);
if (radv_dgc_use_preamble(cmd_info))
return radv_dgc_preamble_cmdbuf_size(device);
uint32_t cmd_size, upload_size;
radv_get_sequence_size(layout, pipeline, &cmd_size, &upload_size);
return radv_align_cmdbuf_size(device, cmd_size * cmd_info->sequencesCount, AMD_IP_GFX);
}
struct radv_dgc_params {
uint32_t cmd_buf_stride;
uint32_t cmd_buf_size;
uint32_t upload_stride;
uint32_t upload_addr;
uint32_t sequence_count;
uint32_t stream_stride;
uint64_t stream_addr;
/* draw info */
uint16_t draw_indexed;
uint16_t draw_params_offset;
uint16_t binds_index_buffer;
uint16_t vtx_base_sgpr;
uint32_t max_index_count;
uint8_t draw_mesh_tasks;
/* dispatch info */
uint32_t dispatch_initiator;
uint16_t dispatch_params_offset;
uint16_t grid_base_sgpr;
/* bind index buffer info. Valid if binds_index_buffer == true && draw_indexed */
uint16_t index_buffer_offset;
uint8_t vbo_cnt;
uint8_t const_copy;
/* Which VBOs are set in this indirect layout. */
uint32_t vbo_bind_mask;
uint16_t vbo_reg;
uint16_t const_copy_size;
uint64_t push_constant_mask;
uint32_t ibo_type_32;
uint32_t ibo_type_8;
uint16_t push_constant_shader_cnt;
uint8_t is_dispatch;
uint8_t use_preamble;
/* For conditional rendering on ACE. */
uint8_t predicating;
uint8_t predication_type;
uint64_t predication_va;
uint8_t bind_pipeline;
uint16_t pipeline_params_offset;
};
enum {
DGC_USES_DRAWID = 1u << 14,
DGC_USES_BASEINSTANCE = 1u << 15,
DGC_USES_GRID_SIZE = DGC_USES_BASEINSTANCE, /* Mesh shader only */
};
enum {
DGC_DYNAMIC_STRIDE = 1u << 15,
};
enum {
DGC_DESC_STREAM,
DGC_DESC_PREPARE,
DGC_DESC_PARAMS,
DGC_DESC_COUNT,
DGC_NUM_DESCS,
};
struct dgc_cmdbuf {
nir_def *descriptor;
nir_variable *offset;
enum amd_gfx_level gfx_level;
bool sqtt_enabled;
};
static void
dgc_emit(nir_builder *b, struct dgc_cmdbuf *cs, unsigned count, nir_def **values)
{
for (unsigned i = 0; i < count; i += 4) {
nir_def *offset = nir_load_var(b, cs->offset);
nir_def *store_val = nir_vec(b, values + i, MIN2(count - i, 4));
assert(store_val->bit_size >= 32);
nir_store_ssbo(b, store_val, cs->descriptor, offset, .access = ACCESS_NON_READABLE);
nir_store_var(b, cs->offset, nir_iadd_imm(b, offset, store_val->num_components * store_val->bit_size / 8), 0x1);
}
}
static void
dgc_emit1(nir_builder *b, struct dgc_cmdbuf *cs, nir_def *value)
{
dgc_emit(b, cs, 1, &value);
}
#define load_param32(b, field) \
nir_load_push_constant((b), 1, 32, nir_imm_int((b), 0), .base = offsetof(struct radv_dgc_params, field), .range = 4)
#define load_param16(b, field) \
nir_ubfe_imm((b), \
nir_load_push_constant((b), 1, 32, nir_imm_int((b), 0), \
.base = (offsetof(struct radv_dgc_params, field) & ~3), .range = 4), \
(offsetof(struct radv_dgc_params, field) & 2) * 8, 16)
#define load_param8(b, field) \
nir_ubfe_imm((b), \
nir_load_push_constant((b), 1, 32, nir_imm_int((b), 0), \
.base = (offsetof(struct radv_dgc_params, field) & ~3), .range = 4), \
(offsetof(struct radv_dgc_params, field) & 3) * 8, 8)
#define load_param64(b, field) \
nir_pack_64_2x32((b), nir_load_push_constant((b), 2, 32, nir_imm_int((b), 0), \
.base = offsetof(struct radv_dgc_params, field), .range = 8))
/* Pipeline metadata */
#define load_metadata32(b, field) \
nir_load_global( \
b, nir_iadd(b, pipeline_va, nir_imm_int64(b, offsetof(struct radv_compute_pipeline_metadata, field))), 4, 1, 32)
#define load_metadata64(b, field) \
nir_load_global( \
b, nir_iadd(b, pipeline_va, nir_imm_int64(b, offsetof(struct radv_compute_pipeline_metadata, field))), 4, 1, 64)
static nir_def *
nir_pkt3_base(nir_builder *b, unsigned op, nir_def *len, bool predicate)
{
len = nir_iand_imm(b, len, 0x3fff);
return nir_ior_imm(b, nir_ishl_imm(b, len, 16), PKT_TYPE_S(3) | PKT3_IT_OPCODE_S(op) | PKT3_PREDICATE(predicate));
}
static nir_def *
nir_pkt3(nir_builder *b, unsigned op, nir_def *len)
{
return nir_pkt3_base(b, op, len, false);
}
static nir_def *
dgc_get_nop_packet(nir_builder *b, const struct radv_device *device)
{
const struct radv_physical_device *pdev = radv_device_physical(device);
if (pdev->info.gfx_ib_pad_with_type2) {
return nir_imm_int(b, PKT2_NOP_PAD);
} else {
return nir_imm_int(b, PKT3_NOP_PAD);
}
}
static void
dgc_emit_userdata_vertex(nir_builder *b, struct dgc_cmdbuf *cs, nir_def *vtx_base_sgpr, nir_def *first_vertex,
nir_def *first_instance, nir_def *drawid, const struct radv_device *device)
{
vtx_base_sgpr = nir_u2u32(b, vtx_base_sgpr);
nir_def *has_drawid = nir_test_mask(b, vtx_base_sgpr, DGC_USES_DRAWID);
nir_def *has_baseinstance = nir_test_mask(b, vtx_base_sgpr, DGC_USES_BASEINSTANCE);
nir_def *pkt_cnt = nir_imm_int(b, 1);
pkt_cnt = nir_bcsel(b, has_drawid, nir_iadd_imm(b, pkt_cnt, 1), pkt_cnt);
pkt_cnt = nir_bcsel(b, has_baseinstance, nir_iadd_imm(b, pkt_cnt, 1), pkt_cnt);
nir_def *values[5] = {
nir_pkt3(b, PKT3_SET_SH_REG, pkt_cnt), nir_iand_imm(b, vtx_base_sgpr, 0x3FFF), first_vertex,
dgc_get_nop_packet(b, device), dgc_get_nop_packet(b, device),
};
values[3] = nir_bcsel(b, nir_ior(b, has_drawid, has_baseinstance), nir_bcsel(b, has_drawid, drawid, first_instance),
values[4]);
values[4] = nir_bcsel(b, nir_iand(b, has_drawid, has_baseinstance), first_instance, values[4]);
dgc_emit(b, cs, 5, values);
}
static void
dgc_emit_userdata_mesh(nir_builder *b, struct dgc_cmdbuf *cs, nir_def *vtx_base_sgpr, nir_def *x, nir_def *y,
nir_def *z, nir_def *drawid, const struct radv_device *device)
{
vtx_base_sgpr = nir_u2u32(b, vtx_base_sgpr);
nir_def *has_grid_size = nir_test_mask(b, vtx_base_sgpr, DGC_USES_GRID_SIZE);
nir_def *has_drawid = nir_test_mask(b, vtx_base_sgpr, DGC_USES_DRAWID);
nir_push_if(b, nir_ior(b, has_grid_size, has_drawid));
{
nir_def *pkt_cnt = nir_imm_int(b, 0);
pkt_cnt = nir_bcsel(b, has_grid_size, nir_iadd_imm(b, pkt_cnt, 3), pkt_cnt);
pkt_cnt = nir_bcsel(b, has_drawid, nir_iadd_imm(b, pkt_cnt, 1), pkt_cnt);
nir_def *values[6] = {
nir_pkt3(b, PKT3_SET_SH_REG, pkt_cnt), nir_iand_imm(b, vtx_base_sgpr, 0x3FFF), dgc_get_nop_packet(b, device),
dgc_get_nop_packet(b, device), dgc_get_nop_packet(b, device), dgc_get_nop_packet(b, device),
};
/* DrawID needs to be first if no GridSize. */
values[2] = nir_bcsel(b, has_grid_size, x, drawid);
values[3] = nir_bcsel(b, has_grid_size, y, values[3]);
values[4] = nir_bcsel(b, has_grid_size, z, values[4]);
values[5] = nir_bcsel(b, has_drawid, drawid, values[5]);
dgc_emit(b, cs, ARRAY_SIZE(values), values);
}
nir_pop_if(b, NULL);
}
static void
dgc_emit_sqtt_userdata(nir_builder *b, struct dgc_cmdbuf *cs, nir_def *data)
{
if (!cs->sqtt_enabled)
return;
nir_def *values[3] = {
nir_pkt3_base(b, PKT3_SET_UCONFIG_REG, nir_imm_int(b, 1), cs->gfx_level >= GFX10),
nir_imm_int(b, (R_030D08_SQ_THREAD_TRACE_USERDATA_2 - CIK_UCONFIG_REG_OFFSET) >> 2),
data,
};
dgc_emit(b, cs, 3, values);
}
static void
dgc_emit_sqtt_thread_trace_marker(nir_builder *b, struct dgc_cmdbuf *cs)
{
if (!cs->sqtt_enabled)
return;
nir_def *values[2] = {
nir_pkt3(b, PKT3_EVENT_WRITE, nir_imm_int(b, 0)),
nir_imm_int(b, EVENT_TYPE(V_028A90_THREAD_TRACE_MARKER | EVENT_INDEX(0))),
};
dgc_emit(b, cs, 2, values);
}
static void
dgc_emit_sqtt_marker_event(nir_builder *b, struct dgc_cmdbuf *cs, nir_def *sequence_id,
enum rgp_sqtt_marker_event_type event)
{
struct rgp_sqtt_marker_event marker = {0};
marker.identifier = RGP_SQTT_MARKER_IDENTIFIER_EVENT;
marker.api_type = event;
dgc_emit_sqtt_userdata(b, cs, nir_imm_int(b, marker.dword01));
dgc_emit_sqtt_userdata(b, cs, nir_imm_int(b, marker.dword02));
dgc_emit_sqtt_userdata(b, cs, sequence_id);
}
static void
dgc_emit_sqtt_marker_event_with_dims(nir_builder *b, struct dgc_cmdbuf *cs, nir_def *sequence_id, nir_def *x,
nir_def *y, nir_def *z, enum rgp_sqtt_marker_event_type event)
{
struct rgp_sqtt_marker_event_with_dims marker = {0};
marker.event.identifier = RGP_SQTT_MARKER_IDENTIFIER_EVENT;
marker.event.api_type = event;
marker.event.has_thread_dims = 1;
dgc_emit_sqtt_userdata(b, cs, nir_imm_int(b, marker.event.dword01));
dgc_emit_sqtt_userdata(b, cs, nir_imm_int(b, marker.event.dword02));
dgc_emit_sqtt_userdata(b, cs, sequence_id);
dgc_emit_sqtt_userdata(b, cs, x);
dgc_emit_sqtt_userdata(b, cs, y);
dgc_emit_sqtt_userdata(b, cs, z);
}
static void
dgc_emit_sqtt_begin_api_marker(nir_builder *b, struct dgc_cmdbuf *cs, enum rgp_sqtt_marker_general_api_type api_type)
{
struct rgp_sqtt_marker_general_api marker = {0};
marker.identifier = RGP_SQTT_MARKER_IDENTIFIER_GENERAL_API;
marker.api_type = api_type;
dgc_emit_sqtt_userdata(b, cs, nir_imm_int(b, marker.dword01));
}
static void
dgc_emit_sqtt_end_api_marker(nir_builder *b, struct dgc_cmdbuf *cs, enum rgp_sqtt_marker_general_api_type api_type)
{
struct rgp_sqtt_marker_general_api marker = {0};
marker.identifier = RGP_SQTT_MARKER_IDENTIFIER_GENERAL_API;
marker.api_type = api_type;
marker.is_end = 1;
dgc_emit_sqtt_userdata(b, cs, nir_imm_int(b, marker.dword01));
}
static void
dgc_emit_instance_count(nir_builder *b, struct dgc_cmdbuf *cs, nir_def *instance_count)
{
nir_def *values[2] = {nir_imm_int(b, PKT3(PKT3_NUM_INSTANCES, 0, false)), instance_count};
dgc_emit(b, cs, 2, values);
}
static void
dgc_emit_draw_index_offset_2(nir_builder *b, struct dgc_cmdbuf *cs, nir_def *index_offset, nir_def *index_count,
nir_def *max_index_count)
{
nir_def *values[5] = {nir_imm_int(b, PKT3(PKT3_DRAW_INDEX_OFFSET_2, 3, false)), max_index_count, index_offset,
index_count, nir_imm_int(b, V_0287F0_DI_SRC_SEL_DMA)};
dgc_emit(b, cs, 5, values);
}
static void
dgc_emit_draw_index_auto(nir_builder *b, struct dgc_cmdbuf *cs, nir_def *vertex_count)
{
nir_def *values[3] = {nir_imm_int(b, PKT3(PKT3_DRAW_INDEX_AUTO, 1, false)), vertex_count,
nir_imm_int(b, V_0287F0_DI_SRC_SEL_AUTO_INDEX)};
dgc_emit(b, cs, 3, values);
}
static void
dgc_emit_dispatch_direct(nir_builder *b, struct dgc_cmdbuf *cs, nir_def *wg_x, nir_def *wg_y, nir_def *wg_z,
nir_def *dispatch_initiator)
{
nir_def *values[5] = {nir_imm_int(b, PKT3(PKT3_DISPATCH_DIRECT, 3, false) | PKT3_SHADER_TYPE_S(1)), wg_x, wg_y, wg_z,
dispatch_initiator};
dgc_emit(b, cs, 5, values);
}
static void
dgc_emit_dispatch_mesh_direct(nir_builder *b, struct dgc_cmdbuf *cs, nir_def *x, nir_def *y, nir_def *z)
{
nir_def *values[5] = {nir_imm_int(b, PKT3(PKT3_DISPATCH_MESH_DIRECT, 3, false)), x, y, z,
nir_imm_int(b, S_0287F0_SOURCE_SELECT(V_0287F0_DI_SRC_SEL_AUTO_INDEX))};
dgc_emit(b, cs, 5, values);
}
static void
dgc_emit_grid_size_user_sgpr(nir_builder *b, struct dgc_cmdbuf *cs, nir_def *grid_base_sgpr, nir_def *wg_x,
nir_def *wg_y, nir_def *wg_z)
{
nir_def *values[5] = {
nir_imm_int(b, PKT3(PKT3_SET_SH_REG, 3, false)), grid_base_sgpr, wg_x, wg_y, wg_z,
};
dgc_emit(b, cs, 5, values);
}
static void
dgc_emit_grid_size_pointer(nir_builder *b, struct dgc_cmdbuf *cs, nir_def *grid_base_sgpr, nir_def *stream_offset)
{
nir_def *stream_addr = load_param64(b, stream_addr);
nir_def *va = nir_iadd(b, stream_addr, nir_u2u64(b, stream_offset));
nir_def *va_lo = nir_unpack_64_2x32_split_x(b, va);
nir_def *va_hi = nir_unpack_64_2x32_split_y(b, va);
nir_def *values[4] = {nir_imm_int(b, PKT3(PKT3_SET_SH_REG, 2, false)), grid_base_sgpr, va_lo, va_hi};
dgc_emit(b, cs, 4, values);
}
static void
dgc_emit_pkt3_set_base(nir_builder *b, struct dgc_cmdbuf *cs, nir_def *va)
{
nir_def *va_lo = nir_unpack_64_2x32_split_x(b, va);
nir_def *va_hi = nir_unpack_64_2x32_split_y(b, va);
nir_def *values[4] = {nir_imm_int(b, PKT3(PKT3_SET_BASE, 2, false)), nir_imm_int(b, 1), va_lo, va_hi};
dgc_emit(b, cs, 4, values);
}
static void
dgc_emit_pkt3_draw_indirect(nir_builder *b, struct dgc_cmdbuf *cs, nir_def *vtx_base_sgpr, bool indexed)
{
const unsigned di_src_sel = indexed ? V_0287F0_DI_SRC_SEL_DMA : V_0287F0_DI_SRC_SEL_AUTO_INDEX;
vtx_base_sgpr = nir_iand_imm(b, nir_u2u32(b, vtx_base_sgpr), 0x3FFF);
nir_def *has_drawid = nir_test_mask(b, vtx_base_sgpr, DGC_USES_DRAWID);
nir_def *has_baseinstance = nir_test_mask(b, vtx_base_sgpr, DGC_USES_BASEINSTANCE);
/* vertex_offset_reg = (base_reg - SI_SH_REG_OFFSET) >> 2 */
nir_def *vertex_offset_reg = vtx_base_sgpr;
/* start_instance_reg = (base_reg + (draw_id_enable ? 8 : 4) - SI_SH_REG_OFFSET) >> 2 */
nir_def *start_instance_offset = nir_bcsel(b, has_drawid, nir_imm_int(b, 2), nir_imm_int(b, 1));
nir_def *start_instance_reg = nir_iadd(b, vtx_base_sgpr, start_instance_offset);
/* draw_id_reg = (base_reg + 4 - SI_SH_REG_OFFSET) >> 2 */
nir_def *draw_id_reg = nir_iadd(b, vtx_base_sgpr, nir_imm_int(b, 1));
nir_if *if_drawid = nir_push_if(b, has_drawid);
{
const unsigned pkt3_op = indexed ? PKT3_DRAW_INDEX_INDIRECT_MULTI : PKT3_DRAW_INDIRECT_MULTI;
nir_def *values[8];
values[0] = nir_imm_int(b, PKT3(pkt3_op, 8, false));
values[1] = nir_imm_int(b, 0);
values[2] = vertex_offset_reg;
values[3] = nir_bcsel(b, has_baseinstance, start_instance_reg, nir_imm_int(b, 0));
values[4] = nir_ior(b, draw_id_reg, nir_imm_int(b, S_2C3_DRAW_INDEX_ENABLE(1)));
values[5] = nir_imm_int(b, 1); /* draw count */
values[6] = nir_imm_int(b, 0); /* count va low */
values[7] = nir_imm_int(b, 0); /* count va high */
dgc_emit(b, cs, 8, values);
values[0] = nir_imm_int(b, 0); /* stride */
values[1] = nir_imm_int(b, V_0287F0_DI_SRC_SEL_AUTO_INDEX);
dgc_emit(b, cs, 2, values);
}
nir_push_else(b, if_drawid);
{
const unsigned pkt3_op = indexed ? PKT3_DRAW_INDEX_INDIRECT : PKT3_DRAW_INDIRECT;
nir_def *values[5];
values[0] = nir_imm_int(b, PKT3(pkt3_op, 3, false));
values[1] = nir_imm_int(b, 0);
values[2] = vertex_offset_reg;
values[3] = nir_bcsel(b, has_baseinstance, start_instance_reg, nir_imm_int(b, 0));
values[4] = nir_imm_int(b, di_src_sel);
dgc_emit(b, cs, 5, values);
}
nir_pop_if(b, if_drawid);
}
static void
dgc_emit_draw_indirect(nir_builder *b, struct dgc_cmdbuf *cs, nir_def *stream_base, nir_def *draw_params_offset,
nir_def *sequence_id, bool indexed)
{
nir_def *vtx_base_sgpr = load_param16(b, vtx_base_sgpr);
nir_def *stream_offset = nir_iadd(b, draw_params_offset, stream_base);
nir_def *stream_addr = load_param64(b, stream_addr);
nir_def *va = nir_iadd(b, stream_addr, nir_u2u64(b, stream_offset));
dgc_emit_sqtt_begin_api_marker(b, cs, indexed ? ApiCmdDrawIndexedIndirect : ApiCmdDrawIndirect);
dgc_emit_sqtt_marker_event(b, cs, sequence_id, indexed ? EventCmdDrawIndexedIndirect : EventCmdDrawIndirect);
dgc_emit_pkt3_set_base(b, cs, va);
dgc_emit_pkt3_draw_indirect(b, cs, vtx_base_sgpr, indexed);
dgc_emit_sqtt_thread_trace_marker(b, cs);
dgc_emit_sqtt_end_api_marker(b, cs, indexed ? ApiCmdDrawIndexedIndirect : ApiCmdDrawIndirect);
}
static nir_def *
dgc_cmd_buf_size(nir_builder *b, nir_def *sequence_count, const struct radv_device *device)
{
nir_def *use_preamble = nir_ine_imm(b, load_param8(b, use_preamble), 0);
nir_def *cmd_buf_size = load_param32(b, cmd_buf_size);
nir_def *cmd_buf_stride = load_param32(b, cmd_buf_stride);
nir_def *size = nir_imul(b, cmd_buf_stride, sequence_count);
unsigned align_mask = radv_align_cmdbuf_size(device, 1, AMD_IP_GFX) - 1;
size = nir_iand_imm(b, nir_iadd_imm(b, size, align_mask), ~align_mask);
/* Ensure we don't have to deal with a jump to an empty IB in the preamble. */
size = nir_imax(b, size, nir_imm_int(b, align_mask + 1));
return nir_bcsel(b, use_preamble, size, cmd_buf_size);
}
static nir_def *
dgc_main_cmd_buf_offset(nir_builder *b, const struct radv_device *device)
{
nir_def *use_preamble = nir_ine_imm(b, load_param8(b, use_preamble), 0);
nir_def *base_offset = nir_imm_int(b, radv_dgc_preamble_cmdbuf_size(device));
return nir_bcsel(b, use_preamble, base_offset, nir_imm_int(b, 0));
}
static void
build_dgc_buffer_tail(nir_builder *b, nir_def *sequence_count, const struct radv_device *device)
{
const struct radv_physical_device *pdev = radv_device_physical(device);
nir_def *global_id = get_global_ids(b, 1);
nir_def *cmd_buf_stride = load_param32(b, cmd_buf_stride);
nir_def *cmd_buf_size = dgc_cmd_buf_size(b, sequence_count, device);
nir_push_if(b, nir_ieq_imm(b, global_id, 0));
{
nir_def *base_offset = dgc_main_cmd_buf_offset(b, device);
nir_def *cmd_buf_tail_start = nir_imul(b, cmd_buf_stride, sequence_count);
nir_variable *offset = nir_variable_create(b->shader, nir_var_shader_temp, glsl_uint_type(), "offset");
nir_store_var(b, offset, cmd_buf_tail_start, 0x1);
nir_def *dst_buf = radv_meta_load_descriptor(b, 0, DGC_DESC_PREPARE);
nir_push_loop(b);
{
nir_def *curr_offset = nir_load_var(b, offset);
const unsigned MAX_PACKET_WORDS = 0x3FFC;
nir_push_if(b, nir_ieq(b, curr_offset, cmd_buf_size));
{
nir_jump(b, nir_jump_break);
}
nir_pop_if(b, NULL);
nir_def *packet, *packet_size;
if (pdev->info.gfx_ib_pad_with_type2) {
packet_size = nir_imm_int(b, 4);
packet = nir_imm_int(b, PKT2_NOP_PAD);
} else {
packet_size = nir_isub(b, cmd_buf_size, curr_offset);
packet_size = nir_umin(b, packet_size, nir_imm_int(b, MAX_PACKET_WORDS * 4));
nir_def *len = nir_ushr_imm(b, packet_size, 2);
len = nir_iadd_imm(b, len, -2);
packet = nir_pkt3(b, PKT3_NOP, len);
}
nir_store_ssbo(b, packet, dst_buf, nir_iadd(b, curr_offset, base_offset), .access = ACCESS_NON_READABLE);
nir_store_var(b, offset, nir_iadd(b, curr_offset, packet_size), 0x1);
}
nir_pop_loop(b, NULL);
}
nir_pop_if(b, NULL);
}
static void
build_dgc_buffer_preamble(nir_builder *b, nir_def *sequence_count, const struct radv_device *device)
{
const struct radv_physical_device *pdev = radv_device_physical(device);
nir_def *global_id = get_global_ids(b, 1);
nir_def *use_preamble = nir_ine_imm(b, load_param8(b, use_preamble), 0);
nir_push_if(b, nir_iand(b, nir_ieq_imm(b, global_id, 0), use_preamble));
{
unsigned preamble_size = radv_dgc_preamble_cmdbuf_size(device);
nir_def *cmd_buf_size = dgc_cmd_buf_size(b, sequence_count, device);
nir_def *dst_buf = radv_meta_load_descriptor(b, 0, DGC_DESC_PREPARE);
nir_def *words = nir_ushr_imm(b, cmd_buf_size, 2);
nir_def *addr = nir_iadd_imm(b, load_param32(b, upload_addr), preamble_size);
nir_def *nop_packet = dgc_get_nop_packet(b, device);
nir_def *nop_packets[] = {
nop_packet,
nop_packet,
nop_packet,
nop_packet,
};
const unsigned jump_size = 16;
unsigned offset;
/* Do vectorized store if possible */
for (offset = 0; offset + 16 <= preamble_size - jump_size; offset += 16) {
nir_store_ssbo(b, nir_vec(b, nop_packets, 4), dst_buf, nir_imm_int(b, offset), .access = ACCESS_NON_READABLE);
}
for (; offset + 4 <= preamble_size - jump_size; offset += 4) {
nir_store_ssbo(b, nop_packet, dst_buf, nir_imm_int(b, offset), .access = ACCESS_NON_READABLE);
}
nir_def *chain_packets[] = {
nir_imm_int(b, PKT3(PKT3_INDIRECT_BUFFER, 2, 0)),
addr,
nir_imm_int(b, pdev->info.address32_hi),
nir_ior_imm(b, words, S_3F2_CHAIN(1) | S_3F2_VALID(1) | S_3F2_PRE_ENA(false)),
};
nir_store_ssbo(b, nir_vec(b, chain_packets, 4), dst_buf, nir_imm_int(b, preamble_size - jump_size),
.access = ACCESS_NON_READABLE);
}
nir_pop_if(b, NULL);
}
/**
* Emit VK_INDIRECT_COMMANDS_TOKEN_TYPE_DRAW_NV.
*/
static void
dgc_emit_draw(nir_builder *b, struct dgc_cmdbuf *cs, nir_def *stream_buf, nir_def *stream_base,
nir_def *draw_params_offset, nir_def *sequence_id, const struct radv_device *device)
{
nir_def *vtx_base_sgpr = load_param16(b, vtx_base_sgpr);
nir_def *stream_offset = nir_iadd(b, draw_params_offset, stream_base);
nir_def *draw_data0 = nir_load_ssbo(b, 4, 32, stream_buf, stream_offset);
nir_def *vertex_count = nir_channel(b, draw_data0, 0);
nir_def *instance_count = nir_channel(b, draw_data0, 1);
nir_def *vertex_offset = nir_channel(b, draw_data0, 2);
nir_def *first_instance = nir_channel(b, draw_data0, 3);
nir_push_if(b, nir_iand(b, nir_ine_imm(b, vertex_count, 0), nir_ine_imm(b, instance_count, 0)));
{
dgc_emit_sqtt_begin_api_marker(b, cs, ApiCmdDraw);
dgc_emit_sqtt_marker_event(b, cs, sequence_id, EventCmdDraw);
dgc_emit_userdata_vertex(b, cs, vtx_base_sgpr, vertex_offset, first_instance, sequence_id, device);
dgc_emit_instance_count(b, cs, instance_count);
dgc_emit_draw_index_auto(b, cs, vertex_count);
dgc_emit_sqtt_thread_trace_marker(b, cs);
dgc_emit_sqtt_end_api_marker(b, cs, ApiCmdDraw);
}
nir_pop_if(b, 0);
}
/**
* Emit VK_INDIRECT_COMMANDS_TOKEN_TYPE_DRAW_INDEXED_NV.
*/
static void
dgc_emit_draw_indexed(nir_builder *b, struct dgc_cmdbuf *cs, nir_def *stream_buf, nir_def *stream_base,
nir_def *draw_params_offset, nir_def *sequence_id, nir_def *max_index_count,
const struct radv_device *device)
{
nir_def *vtx_base_sgpr = load_param16(b, vtx_base_sgpr);
nir_def *stream_offset = nir_iadd(b, draw_params_offset, stream_base);
nir_def *draw_data0 = nir_load_ssbo(b, 4, 32, stream_buf, stream_offset);
nir_def *draw_data1 = nir_load_ssbo(b, 1, 32, stream_buf, nir_iadd_imm(b, stream_offset, 16));
nir_def *index_count = nir_channel(b, draw_data0, 0);
nir_def *instance_count = nir_channel(b, draw_data0, 1);
nir_def *first_index = nir_channel(b, draw_data0, 2);
nir_def *vertex_offset = nir_channel(b, draw_data0, 3);
nir_def *first_instance = nir_channel(b, draw_data1, 0);
nir_push_if(b, nir_iand(b, nir_ine_imm(b, index_count, 0), nir_ine_imm(b, instance_count, 0)));
{
dgc_emit_sqtt_begin_api_marker(b, cs, ApiCmdDrawIndexed);
dgc_emit_sqtt_marker_event(b, cs, sequence_id, EventCmdDrawIndexed);
dgc_emit_userdata_vertex(b, cs, vtx_base_sgpr, vertex_offset, first_instance, sequence_id, device);
dgc_emit_instance_count(b, cs, instance_count);
dgc_emit_draw_index_offset_2(b, cs, first_index, index_count, max_index_count);
dgc_emit_sqtt_thread_trace_marker(b, cs);
dgc_emit_sqtt_end_api_marker(b, cs, ApiCmdDrawIndexed);
}
nir_pop_if(b, 0);
}
/**
* Emit VK_INDIRECT_COMMANDS_TOKEN_TYPE_INDEX_BUFFER_NV.
*/
static void
dgc_emit_index_buffer(nir_builder *b, struct dgc_cmdbuf *cs, nir_def *stream_buf, nir_def *stream_base,
nir_def *index_buffer_offset, nir_def *ibo_type_32, nir_def *ibo_type_8,
nir_variable *max_index_count_var, const struct radv_device *device)
{
const struct radv_physical_device *pdev = radv_device_physical(device);
nir_def *index_stream_offset = nir_iadd(b, index_buffer_offset, stream_base);
nir_def *data = nir_load_ssbo(b, 4, 32, stream_buf, index_stream_offset);
nir_def *vk_index_type = nir_channel(b, data, 3);
nir_def *index_type = nir_bcsel(b, nir_ieq(b, vk_index_type, ibo_type_32), nir_imm_int(b, V_028A7C_VGT_INDEX_32),
nir_imm_int(b, V_028A7C_VGT_INDEX_16));
index_type = nir_bcsel(b, nir_ieq(b, vk_index_type, ibo_type_8), nir_imm_int(b, V_028A7C_VGT_INDEX_8), index_type);
nir_def *index_size = nir_iand_imm(b, nir_ushr(b, nir_imm_int(b, 0x142), nir_imul_imm(b, index_type, 4)), 0xf);
nir_def *max_index_count = nir_udiv(b, nir_channel(b, data, 2), index_size);
nir_store_var(b, max_index_count_var, max_index_count, 0x1);
nir_def *cmd_values[3 + 2 + 3];
if (pdev->info.gfx_level >= GFX9) {
unsigned opcode = PKT3_SET_UCONFIG_REG_INDEX;
if (pdev->info.gfx_level < GFX9 || (pdev->info.gfx_level == GFX9 && pdev->info.me_fw_version < 26))
opcode = PKT3_SET_UCONFIG_REG;
cmd_values[0] = nir_imm_int(b, PKT3(opcode, 1, 0));
cmd_values[1] = nir_imm_int(b, (R_03090C_VGT_INDEX_TYPE - CIK_UCONFIG_REG_OFFSET) >> 2 | (2u << 28));
cmd_values[2] = index_type;
} else {
cmd_values[0] = nir_imm_int(b, PKT3(PKT3_INDEX_TYPE, 0, 0));
cmd_values[1] = index_type;
cmd_values[2] = dgc_get_nop_packet(b, device);
}
nir_def *addr_upper = nir_channel(b, data, 1);
addr_upper = nir_ishr_imm(b, nir_ishl_imm(b, addr_upper, 16), 16);
cmd_values[3] = nir_imm_int(b, PKT3(PKT3_INDEX_BASE, 1, 0));
cmd_values[4] = nir_channel(b, data, 0);
cmd_values[5] = addr_upper;
cmd_values[6] = nir_imm_int(b, PKT3(PKT3_INDEX_BUFFER_SIZE, 0, 0));
cmd_values[7] = max_index_count;
dgc_emit(b, cs, 8, cmd_values);
}
/**
* Emit VK_INDIRECT_COMMANDS_TOKEN_TYPE_PUSH_CONSTANT_NV.
*/
static nir_def *
dgc_get_push_constant_shader_cnt(nir_builder *b, nir_def *stream_buf, nir_def *stream_base,
nir_def *pipeline_params_offset)
{
nir_def *res1, *res2;
nir_push_if(b, nir_ieq_imm(b, load_param8(b, bind_pipeline), 1));
{
nir_def *stream_offset = nir_iadd(b, pipeline_params_offset, stream_base);
nir_def *pipeline_va = nir_load_ssbo(b, 1, 64, stream_buf, stream_offset);
res1 = nir_b2i32(b, nir_ine_imm(b, load_metadata32(b, push_const_sgpr), 0));
}
nir_push_else(b, 0);
{
res2 = load_param16(b, push_constant_shader_cnt);
}
nir_pop_if(b, 0);
return nir_if_phi(b, res1, res2);
}
static nir_def *
dgc_get_upload_sgpr(nir_builder *b, nir_def *stream_buf, nir_def *stream_base, nir_def *param_buf,
nir_def *param_offset, nir_def *cur_shader_idx, nir_def *pipeline_params_offset)
{
nir_def *res1, *res2;
nir_push_if(b, nir_ieq_imm(b, load_param8(b, bind_pipeline), 1));
{
nir_def *stream_offset = nir_iadd(b, pipeline_params_offset, stream_base);
nir_def *pipeline_va = nir_load_ssbo(b, 1, 64, stream_buf, stream_offset);
res1 = load_metadata32(b, push_const_sgpr);
}
nir_push_else(b, 0);
{
res2 = nir_load_ssbo(b, 1, 32, param_buf, nir_iadd(b, param_offset, nir_imul_imm(b, cur_shader_idx, 12)));
}
nir_pop_if(b, 0);
nir_def *res = nir_if_phi(b, res1, res2);
return nir_ubfe_imm(b, res, 0, 16);
}
static nir_def *
dgc_get_inline_sgpr(nir_builder *b, nir_def *stream_buf, nir_def *stream_base, nir_def *param_buf,
nir_def *param_offset, nir_def *cur_shader_idx, nir_def *pipeline_params_offset)
{
nir_def *res1, *res2;
nir_push_if(b, nir_ieq_imm(b, load_param8(b, bind_pipeline), 1));
{
nir_def *stream_offset = nir_iadd(b, pipeline_params_offset, stream_base);
nir_def *pipeline_va = nir_load_ssbo(b, 1, 64, stream_buf, stream_offset);
res1 = load_metadata32(b, push_const_sgpr);
}
nir_push_else(b, 0);
{
res2 = nir_load_ssbo(b, 1, 32, param_buf, nir_iadd(b, param_offset, nir_imul_imm(b, cur_shader_idx, 12)));
}
nir_pop_if(b, 0);
nir_def *res = nir_if_phi(b, res1, res2);
return nir_ubfe_imm(b, res, 16, 16);
}
static nir_def *
dgc_get_inline_mask(nir_builder *b, nir_def *stream_buf, nir_def *stream_base, nir_def *param_buf,
nir_def *param_offset, nir_def *cur_shader_idx, nir_def *pipeline_params_offset)
{
nir_def *res1, *res2;
nir_push_if(b, nir_ieq_imm(b, load_param8(b, bind_pipeline), 1));
{
nir_def *stream_offset = nir_iadd(b, pipeline_params_offset, stream_base);
nir_def *pipeline_va = nir_load_ssbo(b, 1, 64, stream_buf, stream_offset);
res1 = load_metadata64(b, inline_push_const_mask);
}
nir_push_else(b, 0);
{
nir_def *reg_info = nir_load_ssbo(
b, 2, 32, param_buf, nir_iadd(b, param_offset, nir_iadd_imm(b, nir_imul_imm(b, cur_shader_idx, 12), 4)));
res2 = nir_pack_64_2x32(b, nir_channels(b, reg_info, 0x3));
}
nir_pop_if(b, 0);
return nir_if_phi(b, res1, res2);
}
static nir_def *
dgc_push_constant_needs_copy(nir_builder *b, nir_def *stream_buf, nir_def *stream_base, nir_def *pipeline_params_offset)
{
nir_def *res1, *res2;
nir_push_if(b, nir_ieq_imm(b, load_param8(b, bind_pipeline), 1));
{
nir_def *stream_offset = nir_iadd(b, pipeline_params_offset, stream_base);
nir_def *pipeline_va = nir_load_ssbo(b, 1, 64, stream_buf, stream_offset);
res1 = nir_ine_imm(b, nir_ubfe_imm(b, load_metadata32(b, push_const_sgpr), 0, 16), 0);
}
nir_push_else(b, 0);
{
res2 = nir_ine_imm(b, load_param8(b, const_copy), 0);
}
nir_pop_if(b, 0);
return nir_if_phi(b, res1, res2);
}
static void
dgc_emit_push_constant(nir_builder *b, struct dgc_cmdbuf *cs, nir_def *stream_buf, nir_def *stream_base,
nir_def *pipeline_params_offset, nir_def *push_const_mask, nir_variable *upload_offset)
{
nir_def *vbo_cnt = load_param8(b, vbo_cnt);
nir_def *const_copy = dgc_push_constant_needs_copy(b, stream_buf, stream_base, pipeline_params_offset);
nir_def *const_copy_size = load_param16(b, const_copy_size);
nir_def *const_copy_words = nir_ushr_imm(b, const_copy_size, 2);
const_copy_words = nir_bcsel(b, const_copy, const_copy_words, nir_imm_int(b, 0));
nir_variable *idx = nir_variable_create(b->shader, nir_var_shader_temp, glsl_uint_type(), "const_copy_idx");
nir_store_var(b, idx, nir_imm_int(b, 0), 0x1);
nir_def *param_buf = radv_meta_load_descriptor(b, 0, DGC_DESC_PARAMS);
nir_def *param_offset = nir_imul_imm(b, vbo_cnt, 24);
nir_def *param_offset_offset = nir_iadd_imm(b, param_offset, MESA_VULKAN_SHADER_STAGES * 12);
nir_def *param_const_offset =
nir_iadd_imm(b, param_offset, MAX_PUSH_CONSTANTS_SIZE + MESA_VULKAN_SHADER_STAGES * 12);
nir_push_loop(b);
{
nir_def *cur_idx = nir_load_var(b, idx);
nir_push_if(b, nir_uge(b, cur_idx, const_copy_words));
{
nir_jump(b, nir_jump_break);
}
nir_pop_if(b, NULL);
nir_variable *data = nir_variable_create(b->shader, nir_var_shader_temp, glsl_uint_type(), "copy_data");
nir_def *update = nir_iand(b, push_const_mask, nir_ishl(b, nir_imm_int64(b, 1), cur_idx));
update = nir_bcsel(b, nir_ult_imm(b, cur_idx, 64 /* bits in push_const_mask */), update, nir_imm_int64(b, 0));
nir_push_if(b, nir_ine_imm(b, update, 0));
{
nir_def *stream_offset =
nir_load_ssbo(b, 1, 32, param_buf, nir_iadd(b, param_offset_offset, nir_ishl_imm(b, cur_idx, 2)));
nir_def *new_data = nir_load_ssbo(b, 1, 32, stream_buf, nir_iadd(b, stream_base, stream_offset));
nir_store_var(b, data, new_data, 0x1);
}
nir_push_else(b, NULL);
{
nir_store_var(b, data,
nir_load_ssbo(b, 1, 32, param_buf, nir_iadd(b, param_const_offset, nir_ishl_imm(b, cur_idx, 2))),
0x1);
}
nir_pop_if(b, NULL);
nir_store_ssbo(b, nir_load_var(b, data), cs->descriptor,
nir_iadd(b, nir_load_var(b, upload_offset), nir_ishl_imm(b, cur_idx, 2)),
.access = ACCESS_NON_READABLE);
nir_store_var(b, idx, nir_iadd_imm(b, cur_idx, 1), 0x1);
}
nir_pop_loop(b, NULL);
nir_variable *shader_idx = nir_variable_create(b->shader, nir_var_shader_temp, glsl_uint_type(), "shader_idx");
nir_store_var(b, shader_idx, nir_imm_int(b, 0), 0x1);
nir_def *shader_cnt = dgc_get_push_constant_shader_cnt(b, stream_buf, stream_base, pipeline_params_offset);
nir_push_loop(b);
{
nir_def *cur_shader_idx = nir_load_var(b, shader_idx);
nir_push_if(b, nir_uge(b, cur_shader_idx, shader_cnt));
{
nir_jump(b, nir_jump_break);
}
nir_pop_if(b, NULL);
nir_def *upload_sgpr = dgc_get_upload_sgpr(b, stream_buf, stream_base, param_buf, param_offset, cur_shader_idx,
pipeline_params_offset);
nir_def *inline_sgpr = dgc_get_inline_sgpr(b, stream_buf, stream_base, param_buf, param_offset, cur_shader_idx,
pipeline_params_offset);
nir_def *inline_mask = dgc_get_inline_mask(b, stream_buf, stream_base, param_buf, param_offset, cur_shader_idx,
pipeline_params_offset);
nir_push_if(b, nir_ine_imm(b, upload_sgpr, 0));
{
nir_def *pkt[3] = {nir_imm_int(b, PKT3(PKT3_SET_SH_REG, 1, 0)), upload_sgpr,
nir_iadd(b, load_param32(b, upload_addr), nir_load_var(b, upload_offset))};
dgc_emit(b, cs, 3, pkt);
}
nir_pop_if(b, NULL);
nir_push_if(b, nir_ine_imm(b, inline_sgpr, 0));
{
nir_store_var(b, idx, nir_imm_int(b, 0), 0x1);
nir_variable *pc_idx = nir_variable_create(b->shader, nir_var_shader_temp, glsl_uint_type(), "pc_idx");
nir_store_var(b, pc_idx, nir_imm_int(b, 0), 0x1);
nir_push_loop(b);
{
nir_def *cur_idx = nir_load_var(b, idx);
nir_push_if(b, nir_uge_imm(b, cur_idx, 64 /* bits in inline_mask */));
{
nir_jump(b, nir_jump_break);
}
nir_pop_if(b, NULL);
nir_def *l = nir_ishl(b, nir_imm_int64(b, 1), cur_idx);
nir_push_if(b, nir_ieq_imm(b, nir_iand(b, l, inline_mask), 0));
{
nir_store_var(b, idx, nir_iadd_imm(b, cur_idx, 1), 0x1);
nir_jump(b, nir_jump_continue);
}
nir_pop_if(b, NULL);
nir_variable *data = nir_variable_create(b->shader, nir_var_shader_temp, glsl_uint_type(), "copy_data");
nir_def *update = nir_iand(b, push_const_mask, nir_ishl(b, nir_imm_int64(b, 1), cur_idx));
update =
nir_bcsel(b, nir_ult_imm(b, cur_idx, 64 /* bits in push_const_mask */), update, nir_imm_int64(b, 0));
nir_push_if(b, nir_ine_imm(b, update, 0));
{
nir_def *stream_offset =
nir_load_ssbo(b, 1, 32, param_buf, nir_iadd(b, param_offset_offset, nir_ishl_imm(b, cur_idx, 2)));
nir_def *new_data = nir_load_ssbo(b, 1, 32, stream_buf, nir_iadd(b, stream_base, stream_offset));
nir_store_var(b, data, new_data, 0x1);
nir_def *pkt[3] = {nir_pkt3(b, PKT3_SET_SH_REG, nir_imm_int(b, 1)),
nir_iadd(b, inline_sgpr, nir_load_var(b, pc_idx)), nir_load_var(b, data)};
dgc_emit(b, cs, 3, pkt);
}
nir_push_else(b, NULL);
{
nir_push_if(b, nir_ieq_imm(b, load_param8(b, bind_pipeline), 1));
{
/* For indirect pipeline binds, partial push constant updates can't be emitted
* when the DGC execute is called because there is no bound pipeline and they have
* to be emitted from the DGC prepare shader.
*/
nir_def *new_data =
nir_load_ssbo(b, 1, 32, param_buf, nir_iadd(b, param_const_offset, nir_ishl_imm(b, cur_idx, 2)));
nir_store_var(b, data, new_data, 0x1);
nir_def *pkt[3] = {nir_pkt3(b, PKT3_SET_SH_REG, nir_imm_int(b, 1)),
nir_iadd(b, inline_sgpr, nir_load_var(b, pc_idx)), nir_load_var(b, data)};
dgc_emit(b, cs, 3, pkt);
}
nir_pop_if(b, NULL);
}
nir_pop_if(b, NULL);
nir_store_var(b, idx, nir_iadd_imm(b, cur_idx, 1), 0x1);
nir_store_var(b, pc_idx, nir_iadd_imm(b, nir_load_var(b, pc_idx), 1), 0x1);
}
nir_pop_loop(b, NULL);
}
nir_pop_if(b, NULL);
nir_store_var(b, shader_idx, nir_iadd_imm(b, cur_shader_idx, 1), 0x1);
}
nir_pop_loop(b, NULL);
}
/**
* For emitting VK_INDIRECT_COMMANDS_TOKEN_TYPE_VERTEX_BUFFER_NV.
*/
static void
dgc_emit_vertex_buffer(nir_builder *b, struct dgc_cmdbuf *cs, nir_def *stream_buf, nir_def *stream_base,
nir_def *vbo_bind_mask, nir_variable *upload_offset, const struct radv_device *device)
{
const struct radv_physical_device *pdev = radv_device_physical(device);
nir_def *vbo_cnt = load_param8(b, vbo_cnt);
nir_variable *vbo_idx = nir_variable_create(b->shader, nir_var_shader_temp, glsl_uint_type(), "vbo_idx");
nir_store_var(b, vbo_idx, nir_imm_int(b, 0), 0x1);
nir_push_loop(b);
{
nir_push_if(b, nir_uge(b, nir_load_var(b, vbo_idx), vbo_cnt));
{
nir_jump(b, nir_jump_break);
}
nir_pop_if(b, NULL);
nir_def *vbo_offset = nir_imul_imm(b, nir_load_var(b, vbo_idx), 16);
nir_variable *vbo_data = nir_variable_create(b->shader, nir_var_shader_temp, glsl_uvec4_type(), "vbo_data");
nir_def *param_buf = radv_meta_load_descriptor(b, 0, DGC_DESC_PARAMS);
nir_store_var(b, vbo_data, nir_load_ssbo(b, 4, 32, param_buf, vbo_offset), 0xf);
nir_def *vbo_override =
nir_ine_imm(b, nir_iand(b, vbo_bind_mask, nir_ishl(b, nir_imm_int(b, 1), nir_load_var(b, vbo_idx))), 0);
nir_push_if(b, vbo_override);
{
nir_def *vbo_offset_offset =
nir_iadd(b, nir_imul_imm(b, vbo_cnt, 16), nir_imul_imm(b, nir_load_var(b, vbo_idx), 8));
nir_def *vbo_over_data = nir_load_ssbo(b, 2, 32, param_buf, vbo_offset_offset);
nir_def *stream_offset = nir_iadd(b, stream_base, nir_iand_imm(b, nir_channel(b, vbo_over_data, 0), 0x7FFF));
nir_def *stream_data = nir_load_ssbo(b, 4, 32, stream_buf, stream_offset);
nir_def *va = nir_pack_64_2x32(b, nir_trim_vector(b, stream_data, 2));
nir_def *size = nir_channel(b, stream_data, 2);
nir_def *stride = nir_channel(b, stream_data, 3);
nir_def *dyn_stride = nir_test_mask(b, nir_channel(b, vbo_over_data, 0), DGC_DYNAMIC_STRIDE);
nir_def *old_stride = nir_ubfe_imm(b, nir_channel(b, nir_load_var(b, vbo_data), 1), 16, 14);
stride = nir_bcsel(b, dyn_stride, stride, old_stride);
nir_def *use_per_attribute_vb_descs = nir_test_mask(b, nir_channel(b, vbo_over_data, 0), 1u << 31);
nir_variable *num_records =
nir_variable_create(b->shader, nir_var_shader_temp, glsl_uint_type(), "num_records");
nir_store_var(b, num_records, size, 0x1);
nir_push_if(b, use_per_attribute_vb_descs);
{
nir_def *attrib_end = nir_ubfe_imm(b, nir_channel(b, vbo_over_data, 1), 16, 16);
nir_def *attrib_index_offset = nir_ubfe_imm(b, nir_channel(b, vbo_over_data, 1), 0, 16);
nir_push_if(b, nir_ult(b, nir_load_var(b, num_records), attrib_end));
{
nir_store_var(b, num_records, nir_imm_int(b, 0), 0x1);
}
nir_push_else(b, NULL);
nir_push_if(b, nir_ieq_imm(b, stride, 0));
{
nir_store_var(b, num_records, nir_imm_int(b, 1), 0x1);
}
nir_push_else(b, NULL);
{
nir_def *r = nir_iadd(
b, nir_iadd_imm(b, nir_udiv(b, nir_isub(b, nir_load_var(b, num_records), attrib_end), stride), 1),
attrib_index_offset);
nir_store_var(b, num_records, r, 0x1);
}
nir_pop_if(b, NULL);
nir_pop_if(b, NULL);
nir_def *convert_cond = nir_ine_imm(b, nir_load_var(b, num_records), 0);
if (pdev->info.gfx_level == GFX9)
convert_cond = nir_imm_false(b);
else if (pdev->info.gfx_level != GFX8)
convert_cond = nir_iand(b, convert_cond, nir_ieq_imm(b, stride, 0));
nir_def *new_records =
nir_iadd(b, nir_imul(b, nir_iadd_imm(b, nir_load_var(b, num_records), -1), stride), attrib_end);
new_records = nir_bcsel(b, convert_cond, new_records, nir_load_var(b, num_records));
nir_store_var(b, num_records, new_records, 0x1);
}
nir_push_else(b, NULL);
{
if (pdev->info.gfx_level != GFX8) {
nir_push_if(b, nir_ine_imm(b, stride, 0));
{
nir_def *r = nir_iadd(b, nir_load_var(b, num_records), nir_iadd_imm(b, stride, -1));
nir_store_var(b, num_records, nir_udiv(b, r, stride), 0x1);
}
nir_pop_if(b, NULL);
}
}
nir_pop_if(b, NULL);
nir_def *rsrc_word3 = nir_channel(b, nir_load_var(b, vbo_data), 3);
if (pdev->info.gfx_level >= GFX10) {
nir_def *oob_select = nir_bcsel(b, nir_ieq_imm(b, stride, 0), nir_imm_int(b, V_008F0C_OOB_SELECT_RAW),
nir_imm_int(b, V_008F0C_OOB_SELECT_STRUCTURED));
rsrc_word3 = nir_iand_imm(b, rsrc_word3, C_008F0C_OOB_SELECT);
rsrc_word3 = nir_ior(b, rsrc_word3, nir_ishl_imm(b, oob_select, 28));
}
nir_def *va_hi = nir_iand_imm(b, nir_unpack_64_2x32_split_y(b, va), 0xFFFF);
stride = nir_iand_imm(b, stride, 0x3FFF);
nir_def *new_vbo_data[4] = {nir_unpack_64_2x32_split_x(b, va), nir_ior(b, nir_ishl_imm(b, stride, 16), va_hi),
nir_load_var(b, num_records), rsrc_word3};
nir_store_var(b, vbo_data, nir_vec(b, new_vbo_data, 4), 0xf);
}
nir_pop_if(b, NULL);
/* On GFX9, it seems bounds checking is disabled if both
* num_records and stride are zero. This doesn't seem necessary on GFX8, GFX10 and
* GFX10.3 but it doesn't hurt.
*/
nir_def *num_records = nir_channel(b, nir_load_var(b, vbo_data), 2);
nir_def *buf_va =
nir_iand_imm(b, nir_pack_64_2x32(b, nir_trim_vector(b, nir_load_var(b, vbo_data), 2)), (1ull << 48) - 1ull);
nir_push_if(b, nir_ior(b, nir_ieq_imm(b, num_records, 0), nir_ieq_imm(b, buf_va, 0)));
{
nir_def *new_vbo_data[4] = {nir_imm_int(b, 0), nir_imm_int(b, 0), nir_imm_int(b, 0), nir_imm_int(b, 0)};
nir_store_var(b, vbo_data, nir_vec(b, new_vbo_data, 4), 0xf);
}
nir_pop_if(b, NULL);
nir_def *upload_off = nir_iadd(b, nir_load_var(b, upload_offset), vbo_offset);
nir_store_ssbo(b, nir_load_var(b, vbo_data), cs->descriptor, upload_off, .access = ACCESS_NON_READABLE);
nir_store_var(b, vbo_idx, nir_iadd_imm(b, nir_load_var(b, vbo_idx), 1), 0x1);
}
nir_pop_loop(b, NULL);
nir_def *packet[3] = {nir_imm_int(b, PKT3(PKT3_SET_SH_REG, 1, 0)), load_param16(b, vbo_reg),
nir_iadd(b, load_param32(b, upload_addr), nir_load_var(b, upload_offset))};
dgc_emit(b, cs, 3, packet);
nir_store_var(b, upload_offset, nir_iadd(b, nir_load_var(b, upload_offset), nir_imul_imm(b, vbo_cnt, 16)), 0x1);
}
/**
* For emitting VK_INDIRECT_COMMANDS_TOKEN_TYPE_DISPATCH_NV.
*/
static nir_def *
dgc_get_grid_sgpr(nir_builder *b, nir_def *stream_buf, nir_def *stream_base, nir_def *pipeline_params_offset)
{
nir_def *res1, *res2;
nir_push_if(b, nir_ieq_imm(b, load_param8(b, bind_pipeline), 1));
{
nir_def *stream_offset = nir_iadd(b, pipeline_params_offset, stream_base);
nir_def *pipeline_va = nir_load_ssbo(b, 1, 64, stream_buf, stream_offset);
res1 = load_metadata32(b, grid_base_sgpr);
}
nir_push_else(b, 0);
{
res2 = load_param16(b, grid_base_sgpr);
}
nir_pop_if(b, 0);
return nir_if_phi(b, res1, res2);
}
static nir_def *
dgc_get_dispatch_initiator(nir_builder *b, nir_def *stream_buf, nir_def *stream_base, nir_def *pipeline_params_offset)
{
nir_def *res1, *res2;
nir_push_if(b, nir_ieq_imm(b, load_param8(b, bind_pipeline), 1));
{
nir_def *stream_offset = nir_iadd(b, pipeline_params_offset, stream_base);
nir_def *pipeline_va = nir_load_ssbo(b, 1, 64, stream_buf, stream_offset);
nir_def *dispatch_initiator = load_param32(b, dispatch_initiator);
nir_def *wave32 = nir_ieq_imm(b, load_metadata32(b, wave32), 1);
res1 = nir_bcsel(b, wave32, nir_ior_imm(b, dispatch_initiator, S_00B800_CS_W32_EN(1)), dispatch_initiator);
}
nir_push_else(b, 0);
{
res2 = load_param32(b, dispatch_initiator);
}
nir_pop_if(b, 0);
return nir_if_phi(b, res1, res2);
}
static void
dgc_emit_dispatch(nir_builder *b, struct dgc_cmdbuf *cs, nir_def *stream_buf, nir_def *stream_base,
nir_def *dispatch_params_offset, nir_def *pipeline_params_offset, nir_def *sequence_id,
const struct radv_device *device)
{
nir_def *stream_offset = nir_iadd(b, dispatch_params_offset, stream_base);
nir_def *dispatch_data = nir_load_ssbo(b, 3, 32, stream_buf, stream_offset);
nir_def *wg_x = nir_channel(b, dispatch_data, 0);
nir_def *wg_y = nir_channel(b, dispatch_data, 1);
nir_def *wg_z = nir_channel(b, dispatch_data, 2);
nir_def *grid_sgpr = dgc_get_grid_sgpr(b, stream_buf, stream_base, pipeline_params_offset);
nir_push_if(b, nir_ine_imm(b, grid_sgpr, 0));
{
if (device->load_grid_size_from_user_sgpr) {
dgc_emit_grid_size_user_sgpr(b, cs, grid_sgpr, wg_x, wg_y, wg_z);
} else {
dgc_emit_grid_size_pointer(b, cs, grid_sgpr, stream_offset);
}
}
nir_pop_if(b, 0);
nir_push_if(b, nir_iand(b, nir_ine_imm(b, wg_x, 0), nir_iand(b, nir_ine_imm(b, wg_y, 0), nir_ine_imm(b, wg_z, 0))));
{
dgc_emit_sqtt_begin_api_marker(b, cs, ApiCmdDispatch);
dgc_emit_sqtt_marker_event_with_dims(b, cs, sequence_id, wg_x, wg_y, wg_z, EventCmdDispatch);
nir_def *dispatch_initiator = dgc_get_dispatch_initiator(b, stream_buf, stream_base, pipeline_params_offset);
dgc_emit_dispatch_direct(b, cs, wg_x, wg_y, wg_z, dispatch_initiator);
dgc_emit_sqtt_thread_trace_marker(b, cs);
dgc_emit_sqtt_end_api_marker(b, cs, ApiCmdDispatch);
}
nir_pop_if(b, 0);
}
/**
* Emit VK_INDIRECT_COMMANDS_TOKEN_TYPE_DRAW_MESH_TASKS_NV.
*/
static void
dgc_emit_draw_mesh_tasks(nir_builder *b, struct dgc_cmdbuf *cs, nir_def *stream_buf, nir_def *stream_base,
nir_def *draw_params_offset, nir_def *sequence_id, const struct radv_device *device)
{
const struct radv_physical_device *pdev = radv_device_physical(device);
nir_def *vtx_base_sgpr = load_param16(b, vtx_base_sgpr);
nir_def *stream_offset = nir_iadd(b, draw_params_offset, stream_base);
nir_def *draw_data = nir_load_ssbo(b, 3, 32, stream_buf, stream_offset);
nir_def *x = nir_channel(b, draw_data, 0);
nir_def *y = nir_channel(b, draw_data, 1);
nir_def *z = nir_channel(b, draw_data, 2);
nir_push_if(b, nir_iand(b, nir_ine_imm(b, x, 0), nir_iand(b, nir_ine_imm(b, y, 0), nir_ine_imm(b, z, 0))));
{
dgc_emit_sqtt_begin_api_marker(b, cs, ApiCmdDrawMeshTasksEXT);
dgc_emit_sqtt_marker_event(b, cs, sequence_id, EventCmdDrawMeshTasksEXT);
dgc_emit_userdata_mesh(b, cs, vtx_base_sgpr, x, y, z, sequence_id, device);
dgc_emit_instance_count(b, cs, nir_imm_int(b, 1));
if (pdev->mesh_fast_launch_2) {
dgc_emit_dispatch_mesh_direct(b, cs, x, y, z);
} else {
nir_def *vertex_count = nir_imul(b, x, nir_imul(b, y, z));
dgc_emit_draw_index_auto(b, cs, vertex_count);
}
dgc_emit_sqtt_thread_trace_marker(b, cs);
dgc_emit_sqtt_end_api_marker(b, cs, ApiCmdDrawMeshTasksEXT);
}
nir_pop_if(b, NULL);
}
/**
* Emit VK_INDIRECT_COMMANDS_TOKEN_TYPE_PIPELINE_NV.
*/
static void
dgc_emit_set_sh_reg_seq(nir_builder *b, struct dgc_cmdbuf *cs, unsigned reg, unsigned num)
{
nir_def *values[2] = {
nir_imm_int(b, PKT3(PKT3_SET_SH_REG, num, false)),
nir_imm_int(b, (reg - SI_SH_REG_OFFSET) >> 2),
};
dgc_emit(b, cs, 2, values);
}
static void
dgc_emit_bind_pipeline(nir_builder *b, struct dgc_cmdbuf *cs, nir_def *stream_buf, nir_def *stream_base,
nir_def *pipeline_params_offset, const struct radv_device *device)
{
const struct radv_physical_device *pdev = radv_device_physical(device);
nir_def *stream_offset = nir_iadd(b, pipeline_params_offset, stream_base);
nir_def *pipeline_va = nir_load_ssbo(b, 1, 64, stream_buf, stream_offset);
dgc_emit_set_sh_reg_seq(b, cs, R_00B830_COMPUTE_PGM_LO, 1);
dgc_emit1(b, cs, load_metadata32(b, shader_va));
dgc_emit_set_sh_reg_seq(b, cs, R_00B848_COMPUTE_PGM_RSRC1, 2);
dgc_emit1(b, cs, load_metadata32(b, rsrc1));
dgc_emit1(b, cs, load_metadata32(b, rsrc2));
if (pdev->info.gfx_level >= GFX10) {
dgc_emit_set_sh_reg_seq(b, cs, R_00B8A0_COMPUTE_PGM_RSRC3, 1);
dgc_emit1(b, cs, load_metadata32(b, rsrc3));
}
dgc_emit_set_sh_reg_seq(b, cs, R_00B854_COMPUTE_RESOURCE_LIMITS, 1);
dgc_emit1(b, cs, load_metadata32(b, compute_resource_limits));
dgc_emit_set_sh_reg_seq(b, cs, R_00B81C_COMPUTE_NUM_THREAD_X, 3);
dgc_emit1(b, cs, load_metadata32(b, block_size_x));
dgc_emit1(b, cs, load_metadata32(b, block_size_y));
dgc_emit1(b, cs, load_metadata32(b, block_size_z));
}
static nir_def *
dgc_is_cond_render_enabled(nir_builder *b)
{
nir_def *res1, *res2;
nir_push_if(b, nir_ieq_imm(b, load_param8(b, predicating), 1));
{
nir_def *val = nir_load_global(b, load_param64(b, predication_va), 4, 1, 32);
/* By default, all rendering commands are discarded if the 32-bit value is zero. If the
* inverted flag is set, they are discarded if the value is non-zero.
*/
res1 = nir_ixor(b, nir_i2b(b, load_param8(b, predication_type)), nir_ine_imm(b, val, 0));
}
nir_push_else(b, 0);
{
res2 = nir_imm_bool(b, false);
}
nir_pop_if(b, 0);
return nir_if_phi(b, res1, res2);
}
static nir_shader *
build_dgc_prepare_shader(struct radv_device *dev)
{
const struct radv_physical_device *pdev = radv_device_physical(dev);
nir_builder b = radv_meta_init_shader(dev, MESA_SHADER_COMPUTE, "meta_dgc_prepare");
b.shader->info.workgroup_size[0] = 64;
nir_def *global_id = get_global_ids(&b, 1);
nir_def *sequence_id = global_id;
nir_def *cmd_buf_stride = load_param32(&b, cmd_buf_stride);
nir_def *sequence_count = load_param32(&b, sequence_count);
nir_def *stream_stride = load_param32(&b, stream_stride);
nir_def *use_count = nir_iand_imm(&b, sequence_count, 1u << 31);
sequence_count = nir_iand_imm(&b, sequence_count, UINT32_MAX >> 1);
nir_def *cmd_buf_base_offset = dgc_main_cmd_buf_offset(&b, dev);
/* The effective number of draws is
* min(sequencesCount, sequencesCountBuffer[sequencesCountOffset]) when
* using sequencesCountBuffer. Otherwise it is sequencesCount. */
nir_variable *count_var = nir_variable_create(b.shader, nir_var_shader_temp, glsl_uint_type(), "sequence_count");
nir_store_var(&b, count_var, sequence_count, 0x1);
nir_push_if(&b, nir_ine_imm(&b, use_count, 0));
{
nir_def *count_buf = radv_meta_load_descriptor(&b, 0, DGC_DESC_COUNT);
nir_def *cnt = nir_load_ssbo(&b, 1, 32, count_buf, nir_imm_int(&b, 0));
/* Must clamp count against the API count explicitly.
* The workgroup potentially contains more threads than maxSequencesCount from API,
* and we have to ensure these threads write NOP packets to pad out the IB. */
cnt = nir_umin(&b, cnt, sequence_count);
nir_store_var(&b, count_var, cnt, 0x1);
}
nir_pop_if(&b, NULL);
nir_push_if(&b, dgc_is_cond_render_enabled(&b));
{
/* Reset the number of sequences when conditional rendering is enabled in order to skip the
* entire shader and pad the cmdbuf with NOPs.
*/
nir_store_var(&b, count_var, nir_imm_int(&b, 0), 0x1);
}
nir_pop_if(&b, NULL);
sequence_count = nir_load_var(&b, count_var);
nir_push_if(&b, nir_ult(&b, sequence_id, sequence_count));
{
struct dgc_cmdbuf cmd_buf = {
.descriptor = radv_meta_load_descriptor(&b, 0, DGC_DESC_PREPARE),
.offset = nir_variable_create(b.shader, nir_var_shader_temp, glsl_uint_type(), "cmd_buf_offset"),
.gfx_level = pdev->info.gfx_level,
.sqtt_enabled = !!dev->sqtt.bo,
};
nir_store_var(&b, cmd_buf.offset, nir_iadd(&b, nir_imul(&b, global_id, cmd_buf_stride), cmd_buf_base_offset), 1);
nir_def *cmd_buf_end = nir_iadd(&b, nir_load_var(&b, cmd_buf.offset), cmd_buf_stride);
nir_def *stream_buf = radv_meta_load_descriptor(&b, 0, DGC_DESC_STREAM);
nir_def *stream_base = nir_imul(&b, sequence_id, stream_stride);
nir_variable *upload_offset =
nir_variable_create(b.shader, nir_var_shader_temp, glsl_uint_type(), "upload_offset");
nir_def *upload_offset_init = nir_iadd(&b, nir_iadd(&b, load_param32(&b, cmd_buf_size), cmd_buf_base_offset),
nir_imul(&b, load_param32(&b, upload_stride), sequence_id));
nir_store_var(&b, upload_offset, upload_offset_init, 0x1);
nir_def *vbo_bind_mask = load_param32(&b, vbo_bind_mask);
nir_push_if(&b, nir_ine_imm(&b, vbo_bind_mask, 0));
{
dgc_emit_vertex_buffer(&b, &cmd_buf, stream_buf, stream_base, vbo_bind_mask, upload_offset, dev);
}
nir_pop_if(&b, NULL);
nir_def *push_const_mask = load_param64(&b, push_constant_mask);
nir_push_if(&b, nir_ine_imm(&b, push_const_mask, 0));
{
dgc_emit_push_constant(&b, &cmd_buf, stream_buf, stream_base, load_param16(&b, pipeline_params_offset),
push_const_mask, upload_offset);
}
nir_pop_if(&b, 0);
nir_push_if(&b, nir_ieq_imm(&b, load_param8(&b, bind_pipeline), 1));
{
dgc_emit_bind_pipeline(&b, &cmd_buf, stream_buf, stream_base, load_param16(&b, pipeline_params_offset), dev);
}
nir_pop_if(&b, 0);
nir_push_if(&b, nir_ieq_imm(&b, load_param8(&b, is_dispatch), 0));
{
nir_push_if(&b, nir_ieq_imm(&b, load_param16(&b, draw_indexed), 0));
{
nir_def *draw_mesh_tasks = load_param8(&b, draw_mesh_tasks);
nir_push_if(&b, nir_ieq_imm(&b, draw_mesh_tasks, 0));
{
dgc_emit_draw(&b, &cmd_buf, stream_buf, stream_base, load_param16(&b, draw_params_offset), sequence_id,
dev);
}
nir_push_else(&b, NULL);
{
dgc_emit_draw_mesh_tasks(&b, &cmd_buf, stream_buf, stream_base, load_param16(&b, draw_params_offset),
sequence_id, dev);
}
nir_pop_if(&b, NULL);
}
nir_push_else(&b, NULL);
{
/* Emit direct draws when index buffers are also updated by DGC. Otherwise, emit
* indirect draws to remove the dependency on the cmdbuf state in order to enable
* preprocessing.
*/
nir_def *binds_index_buffer = nir_ine_imm(&b, load_param16(&b, binds_index_buffer), 0);
nir_push_if(&b, binds_index_buffer);
{
nir_variable *max_index_count_var =
nir_variable_create(b.shader, nir_var_shader_temp, glsl_uint_type(), "max_index_count");
dgc_emit_index_buffer(&b, &cmd_buf, stream_buf, stream_base, load_param16(&b, index_buffer_offset),
load_param32(&b, ibo_type_32), load_param32(&b, ibo_type_8), max_index_count_var,
dev);
nir_def *max_index_count = nir_load_var(&b, max_index_count_var);
dgc_emit_draw_indexed(&b, &cmd_buf, stream_buf, stream_base, load_param16(&b, draw_params_offset),
sequence_id, max_index_count, dev);
}
nir_push_else(&b, NULL);
{
dgc_emit_draw_indirect(&b, &cmd_buf, stream_base, load_param16(&b, draw_params_offset), sequence_id,
true);
}
nir_pop_if(&b, NULL);
}
nir_pop_if(&b, NULL);
}
nir_push_else(&b, NULL);
{
dgc_emit_dispatch(&b, &cmd_buf, stream_buf, stream_base, load_param16(&b, dispatch_params_offset),
load_param16(&b, pipeline_params_offset), sequence_id, dev);
}
nir_pop_if(&b, NULL);
/* Pad the cmdbuffer if we did not use the whole stride */
nir_push_if(&b, nir_ine(&b, nir_load_var(&b, cmd_buf.offset), cmd_buf_end));
{
if (pdev->info.gfx_ib_pad_with_type2) {
nir_push_loop(&b);
{
nir_def *curr_offset = nir_load_var(&b, cmd_buf.offset);
nir_push_if(&b, nir_ieq(&b, curr_offset, cmd_buf_end));
{
nir_jump(&b, nir_jump_break);
}
nir_pop_if(&b, NULL);
nir_def *pkt = nir_imm_int(&b, PKT2_NOP_PAD);
dgc_emit1(&b, &cmd_buf, pkt);
}
nir_pop_loop(&b, NULL);
} else {
nir_def *cnt = nir_isub(&b, cmd_buf_end, nir_load_var(&b, cmd_buf.offset));
cnt = nir_ushr_imm(&b, cnt, 2);
cnt = nir_iadd_imm(&b, cnt, -2);
nir_def *pkt = nir_pkt3(&b, PKT3_NOP, cnt);
dgc_emit1(&b, &cmd_buf, pkt);
}
}
nir_pop_if(&b, NULL);
}
nir_pop_if(&b, NULL);
build_dgc_buffer_tail(&b, sequence_count, dev);
build_dgc_buffer_preamble(&b, sequence_count, dev);
return b.shader;
}
void
radv_device_finish_dgc_prepare_state(struct radv_device *device)
{
radv_DestroyPipeline(radv_device_to_handle(device), device->meta_state.dgc_prepare.pipeline,
&device->meta_state.alloc);
radv_DestroyPipelineLayout(radv_device_to_handle(device), device->meta_state.dgc_prepare.p_layout,
&device->meta_state.alloc);
device->vk.dispatch_table.DestroyDescriptorSetLayout(
radv_device_to_handle(device), device->meta_state.dgc_prepare.ds_layout, &device->meta_state.alloc);
}
VkResult
radv_device_init_dgc_prepare_state(struct radv_device *device)
{
VkResult result;
nir_shader *cs = build_dgc_prepare_shader(device);
VkDescriptorSetLayoutCreateInfo ds_create_info = {.sType = VK_STRUCTURE_TYPE_DESCRIPTOR_SET_LAYOUT_CREATE_INFO,
.flags = VK_DESCRIPTOR_SET_LAYOUT_CREATE_PUSH_DESCRIPTOR_BIT_KHR,
.bindingCount = DGC_NUM_DESCS,
.pBindings = (VkDescriptorSetLayoutBinding[]){
{.binding = DGC_DESC_STREAM,
.descriptorType = VK_DESCRIPTOR_TYPE_STORAGE_BUFFER,
.descriptorCount = 1,
.stageFlags = VK_SHADER_STAGE_COMPUTE_BIT,
.pImmutableSamplers = NULL},
{.binding = DGC_DESC_PREPARE,
.descriptorType = VK_DESCRIPTOR_TYPE_STORAGE_BUFFER,
.descriptorCount = 1,
.stageFlags = VK_SHADER_STAGE_COMPUTE_BIT,
.pImmutableSamplers = NULL},
{.binding = DGC_DESC_PARAMS,
.descriptorType = VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER,
.descriptorCount = 1,
.stageFlags = VK_SHADER_STAGE_COMPUTE_BIT,
.pImmutableSamplers = NULL},
{.binding = DGC_DESC_COUNT,
.descriptorType = VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER,
.descriptorCount = 1,
.stageFlags = VK_SHADER_STAGE_COMPUTE_BIT,
.pImmutableSamplers = NULL},
}};
result = radv_CreateDescriptorSetLayout(radv_device_to_handle(device), &ds_create_info, &device->meta_state.alloc,
&device->meta_state.dgc_prepare.ds_layout);
if (result != VK_SUCCESS)
goto cleanup;
const VkPipelineLayoutCreateInfo leaf_pl_create_info = {
.sType = VK_STRUCTURE_TYPE_PIPELINE_LAYOUT_CREATE_INFO,
.setLayoutCount = 1,
.pSetLayouts = &device->meta_state.dgc_prepare.ds_layout,
.pushConstantRangeCount = 1,
.pPushConstantRanges = &(VkPushConstantRange){VK_SHADER_STAGE_COMPUTE_BIT, 0, sizeof(struct radv_dgc_params)},
};
result = radv_CreatePipelineLayout(radv_device_to_handle(device), &leaf_pl_create_info, &device->meta_state.alloc,
&device->meta_state.dgc_prepare.p_layout);
if (result != VK_SUCCESS)
goto cleanup;
VkPipelineShaderStageCreateInfo shader_stage = {
.sType = VK_STRUCTURE_TYPE_PIPELINE_SHADER_STAGE_CREATE_INFO,
.stage = VK_SHADER_STAGE_COMPUTE_BIT,
.module = vk_shader_module_handle_from_nir(cs),
.pName = "main",
.pSpecializationInfo = NULL,
};
VkComputePipelineCreateInfo pipeline_info = {
.sType = VK_STRUCTURE_TYPE_COMPUTE_PIPELINE_CREATE_INFO,
.stage = shader_stage,
.flags = 0,
.layout = device->meta_state.dgc_prepare.p_layout,
};
result = radv_compute_pipeline_create(radv_device_to_handle(device), device->meta_state.cache, &pipeline_info,
&device->meta_state.alloc, &device->meta_state.dgc_prepare.pipeline);
if (result != VK_SUCCESS)
goto cleanup;
cleanup:
ralloc_free(cs);
return result;
}
VKAPI_ATTR VkResult VKAPI_CALL
radv_CreateIndirectCommandsLayoutNV(VkDevice _device, const VkIndirectCommandsLayoutCreateInfoNV *pCreateInfo,
const VkAllocationCallbacks *pAllocator,
VkIndirectCommandsLayoutNV *pIndirectCommandsLayout)
{
VK_FROM_HANDLE(radv_device, device, _device);
struct radv_indirect_command_layout *layout;
size_t size = sizeof(*layout) + pCreateInfo->tokenCount * sizeof(VkIndirectCommandsLayoutTokenNV);
layout = vk_zalloc2(&device->vk.alloc, pAllocator, size, alignof(struct radv_indirect_command_layout),
VK_SYSTEM_ALLOCATION_SCOPE_OBJECT);
if (!layout)
return vk_error(device, VK_ERROR_OUT_OF_HOST_MEMORY);
vk_object_base_init(&device->vk, &layout->base, VK_OBJECT_TYPE_INDIRECT_COMMANDS_LAYOUT_NV);
layout->flags = pCreateInfo->flags;
layout->pipeline_bind_point = pCreateInfo->pipelineBindPoint;
layout->input_stride = pCreateInfo->pStreamStrides[0];
layout->token_count = pCreateInfo->tokenCount;
typed_memcpy(layout->tokens, pCreateInfo->pTokens, pCreateInfo->tokenCount);
layout->ibo_type_32 = VK_INDEX_TYPE_UINT32;
layout->ibo_type_8 = VK_INDEX_TYPE_UINT8_KHR;
for (unsigned i = 0; i < pCreateInfo->tokenCount; ++i) {
switch (pCreateInfo->pTokens[i].tokenType) {
case VK_INDIRECT_COMMANDS_TOKEN_TYPE_DRAW_NV:
layout->draw_params_offset = pCreateInfo->pTokens[i].offset;
break;
case VK_INDIRECT_COMMANDS_TOKEN_TYPE_DRAW_INDEXED_NV:
layout->indexed = true;
layout->draw_params_offset = pCreateInfo->pTokens[i].offset;
break;
case VK_INDIRECT_COMMANDS_TOKEN_TYPE_DISPATCH_NV:
layout->dispatch_params_offset = pCreateInfo->pTokens[i].offset;
break;
case VK_INDIRECT_COMMANDS_TOKEN_TYPE_INDEX_BUFFER_NV:
layout->binds_index_buffer = true;
layout->index_buffer_offset = pCreateInfo->pTokens[i].offset;
/* 16-bit is implied if we find no match. */
for (unsigned j = 0; j < pCreateInfo->pTokens[i].indexTypeCount; j++) {
if (pCreateInfo->pTokens[i].pIndexTypes[j] == VK_INDEX_TYPE_UINT32)
layout->ibo_type_32 = pCreateInfo->pTokens[i].pIndexTypeValues[j];
else if (pCreateInfo->pTokens[i].pIndexTypes[j] == VK_INDEX_TYPE_UINT8_KHR)
layout->ibo_type_8 = pCreateInfo->pTokens[i].pIndexTypeValues[j];
}
break;
case VK_INDIRECT_COMMANDS_TOKEN_TYPE_VERTEX_BUFFER_NV:
layout->bind_vbo_mask |= 1u << pCreateInfo->pTokens[i].vertexBindingUnit;
layout->vbo_offsets[pCreateInfo->pTokens[i].vertexBindingUnit] = pCreateInfo->pTokens[i].offset;
if (pCreateInfo->pTokens[i].vertexDynamicStride)
layout->vbo_offsets[pCreateInfo->pTokens[i].vertexBindingUnit] |= DGC_DYNAMIC_STRIDE;
break;
case VK_INDIRECT_COMMANDS_TOKEN_TYPE_PUSH_CONSTANT_NV: {
VK_FROM_HANDLE(radv_pipeline_layout, pipeline_layout, pCreateInfo->pTokens[i].pushconstantPipelineLayout);
for (unsigned j = pCreateInfo->pTokens[i].pushconstantOffset / 4, k = 0;
k < pCreateInfo->pTokens[i].pushconstantSize / 4; ++j, ++k) {
layout->push_constant_mask |= 1ull << j;
layout->push_constant_offsets[j] = pCreateInfo->pTokens[i].offset + k * 4;
}
layout->push_constant_size = pipeline_layout->push_constant_size;
layout->dynamic_offset_count = pipeline_layout->dynamic_offset_count;
break;
}
case VK_INDIRECT_COMMANDS_TOKEN_TYPE_DRAW_MESH_TASKS_NV:
layout->draw_mesh_tasks = true;
layout->draw_params_offset = pCreateInfo->pTokens[i].offset;
break;
case VK_INDIRECT_COMMANDS_TOKEN_TYPE_PIPELINE_NV:
layout->bind_pipeline = true;
layout->pipeline_params_offset = pCreateInfo->pTokens[i].offset;
break;
default:
unreachable("Unhandled token type");
}
}
if (!layout->indexed)
layout->binds_index_buffer = false;
*pIndirectCommandsLayout = radv_indirect_command_layout_to_handle(layout);
return VK_SUCCESS;
}
VKAPI_ATTR void VKAPI_CALL
radv_DestroyIndirectCommandsLayoutNV(VkDevice _device, VkIndirectCommandsLayoutNV indirectCommandsLayout,
const VkAllocationCallbacks *pAllocator)
{
VK_FROM_HANDLE(radv_device, device, _device);
VK_FROM_HANDLE(radv_indirect_command_layout, layout, indirectCommandsLayout);
if (!layout)
return;
vk_object_base_finish(&layout->base);
vk_free2(&device->vk.alloc, pAllocator, layout);
}
VKAPI_ATTR void VKAPI_CALL
radv_GetGeneratedCommandsMemoryRequirementsNV(VkDevice _device,
const VkGeneratedCommandsMemoryRequirementsInfoNV *pInfo,
VkMemoryRequirements2 *pMemoryRequirements)
{
VK_FROM_HANDLE(radv_device, device, _device);
const struct radv_physical_device *pdev = radv_device_physical(device);
VK_FROM_HANDLE(radv_indirect_command_layout, layout, pInfo->indirectCommandsLayout);
VK_FROM_HANDLE(radv_pipeline, pipeline, pInfo->pipeline);
uint32_t cmd_stride, upload_stride;
radv_get_sequence_size(layout, pipeline, &cmd_stride, &upload_stride);
VkDeviceSize cmd_buf_size = radv_align_cmdbuf_size(device, cmd_stride * pInfo->maxSequencesCount, AMD_IP_GFX) +
radv_dgc_preamble_cmdbuf_size(device);
VkDeviceSize upload_buf_size = upload_stride * pInfo->maxSequencesCount;
pMemoryRequirements->memoryRequirements.memoryTypeBits = pdev->memory_types_32bit;
pMemoryRequirements->memoryRequirements.alignment =
MAX2(pdev->info.ip[AMD_IP_GFX].ib_alignment, pdev->info.ip[AMD_IP_COMPUTE].ib_alignment);
pMemoryRequirements->memoryRequirements.size =
align(cmd_buf_size + upload_buf_size, pMemoryRequirements->memoryRequirements.alignment);
}
bool
radv_use_dgc_predication(struct radv_cmd_buffer *cmd_buffer, const VkGeneratedCommandsInfoNV *pGeneratedCommandsInfo)
{
VK_FROM_HANDLE(radv_buffer, seq_count_buffer, pGeneratedCommandsInfo->sequencesCountBuffer);
/* Enable conditional rendering (if not enabled by user) to skip prepare/execute DGC calls when
* the indirect sequence count might be zero. This can only be enabled on GFX because on ACE it's
* not possible to skip the execute DGC call (ie. no INDIRECT_PACKET)
*/
return cmd_buffer->qf == RADV_QUEUE_GENERAL && seq_count_buffer && !cmd_buffer->state.predicating;
}
static bool
radv_dgc_need_push_constants_copy(const struct radv_pipeline *pipeline)
{
for (unsigned i = 0; i < ARRAY_SIZE(pipeline->shaders); ++i) {
const struct radv_shader *shader = pipeline->shaders[i];
if (!shader)
continue;
const struct radv_userdata_locations *locs = &shader->info.user_sgprs_locs;
if (locs->shader_data[AC_UD_PUSH_CONSTANTS].sgpr_idx >= 0)
return true;
}
return false;
}
bool
radv_dgc_can_preprocess(const struct radv_indirect_command_layout *layout, struct radv_pipeline *pipeline)
{
if (!(layout->flags & VK_INDIRECT_COMMANDS_LAYOUT_USAGE_EXPLICIT_PREPROCESS_BIT_NV))
return false;
/* From the Vulkan spec (1.3.269, chapter 32):
* "The bound descriptor sets and push constants that will be used with indirect command generation for the compute
* piplines must already be specified at the time of preprocessing commands with vkCmdPreprocessGeneratedCommandsNV.
* They must not change until the execution of indirect commands is submitted with vkCmdExecuteGeneratedCommandsNV."
*
* So we can always preprocess compute layouts.
*/
if (layout->pipeline_bind_point != VK_PIPELINE_BIND_POINT_COMPUTE) {
/* VBO binding (in particular partial VBO binding) uses some draw state which we don't generate at preprocess time
* yet. */
if (layout->bind_vbo_mask)
return false;
/* Do not preprocess when all push constants can't be inlined because they need to be copied
* to the upload BO.
*/
if (layout->push_constant_mask && radv_dgc_need_push_constants_copy(pipeline))
return false;
}
return true;
}
VKAPI_ATTR void VKAPI_CALL
radv_CmdPreprocessGeneratedCommandsNV(VkCommandBuffer commandBuffer,
const VkGeneratedCommandsInfoNV *pGeneratedCommandsInfo)
{
VK_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
VK_FROM_HANDLE(radv_indirect_command_layout, layout, pGeneratedCommandsInfo->indirectCommandsLayout);
VK_FROM_HANDLE(radv_pipeline, pipeline, pGeneratedCommandsInfo->pipeline);
if (!radv_dgc_can_preprocess(layout, pipeline))
return;
/* VK_EXT_conditional_rendering says that copy commands should not be
* affected by conditional rendering.
*/
const bool old_predicating = cmd_buffer->state.predicating;
cmd_buffer->state.predicating = false;
radv_prepare_dgc(cmd_buffer, pGeneratedCommandsInfo, false);
/* Restore conditional rendering. */
cmd_buffer->state.predicating = old_predicating;
}
/* Always need to call this directly before draw due to dependence on bound state. */
static void
radv_prepare_dgc_graphics(struct radv_cmd_buffer *cmd_buffer, const VkGeneratedCommandsInfoNV *pGeneratedCommandsInfo,
unsigned *upload_size, unsigned *upload_offset, void **upload_data,
struct radv_dgc_params *params)
{
VK_FROM_HANDLE(radv_indirect_command_layout, layout, pGeneratedCommandsInfo->indirectCommandsLayout);
VK_FROM_HANDLE(radv_pipeline, pipeline, pGeneratedCommandsInfo->pipeline);
struct radv_graphics_pipeline *graphics_pipeline = radv_pipeline_to_graphics(pipeline);
struct radv_shader *vs = radv_get_shader(graphics_pipeline->base.shaders, MESA_SHADER_VERTEX);
unsigned vb_size = layout->bind_vbo_mask ? util_bitcount(vs->info.vs.vb_desc_usage_mask) * 24 : 0;
*upload_size = MAX2(*upload_size + vb_size, 16);
if (!radv_cmd_buffer_upload_alloc(cmd_buffer, *upload_size, upload_offset, upload_data)) {
vk_command_buffer_set_error(&cmd_buffer->vk, VK_ERROR_OUT_OF_HOST_MEMORY);
return;
}
uint16_t vtx_base_sgpr = 0;
if (cmd_buffer->state.graphics_pipeline->vtx_base_sgpr)
vtx_base_sgpr = (cmd_buffer->state.graphics_pipeline->vtx_base_sgpr - SI_SH_REG_OFFSET) >> 2;
if (cmd_buffer->state.graphics_pipeline->uses_drawid)
vtx_base_sgpr |= DGC_USES_DRAWID;
if (layout->draw_mesh_tasks) {
struct radv_shader *mesh_shader = radv_get_shader(graphics_pipeline->base.shaders, MESA_SHADER_MESH);
if (mesh_shader->info.cs.uses_grid_size)
vtx_base_sgpr |= DGC_USES_GRID_SIZE;
} else {
if (cmd_buffer->state.graphics_pipeline->uses_baseinstance)
vtx_base_sgpr |= DGC_USES_BASEINSTANCE;
}
params->draw_indexed = layout->indexed;
params->draw_params_offset = layout->draw_params_offset;
params->binds_index_buffer = layout->binds_index_buffer;
params->vtx_base_sgpr = vtx_base_sgpr;
params->max_index_count = cmd_buffer->state.max_index_count;
params->index_buffer_offset = layout->index_buffer_offset;
params->ibo_type_32 = layout->ibo_type_32;
params->ibo_type_8 = layout->ibo_type_8;
params->draw_mesh_tasks = layout->draw_mesh_tasks;
if (layout->bind_vbo_mask) {
uint32_t mask = vs->info.vs.vb_desc_usage_mask;
unsigned vb_desc_alloc_size = util_bitcount(mask) * 16;
radv_write_vertex_descriptors(cmd_buffer, graphics_pipeline, true, *upload_data);
uint32_t *vbo_info = (uint32_t *)((char *)*upload_data + vb_desc_alloc_size);
unsigned idx = 0;
while (mask) {
unsigned i = u_bit_scan(&mask);
unsigned binding = vs->info.vs.use_per_attribute_vb_descs ? graphics_pipeline->attrib_bindings[i] : i;
uint32_t attrib_end = graphics_pipeline->attrib_ends[i];
params->vbo_bind_mask |= ((layout->bind_vbo_mask >> binding) & 1u) << idx;
vbo_info[2 * idx] = ((vs->info.vs.use_per_attribute_vb_descs ? 1u : 0u) << 31) | layout->vbo_offsets[binding];
vbo_info[2 * idx + 1] = graphics_pipeline->attrib_index_offset[i] | (attrib_end << 16);
++idx;
}
params->vbo_cnt = idx;
params->vbo_reg =
((radv_get_user_sgpr(vs, AC_UD_VS_VERTEX_BUFFERS)->sgpr_idx * 4 + vs->info.user_data_0) - SI_SH_REG_OFFSET) >>
2;
*upload_data = (char *)*upload_data + vb_size;
}
}
static void
radv_prepare_dgc_compute(struct radv_cmd_buffer *cmd_buffer, const VkGeneratedCommandsInfoNV *pGeneratedCommandsInfo,
unsigned *upload_size, unsigned *upload_offset, void **upload_data,
struct radv_dgc_params *params, bool cond_render_enabled)
{
VK_FROM_HANDLE(radv_indirect_command_layout, layout, pGeneratedCommandsInfo->indirectCommandsLayout);
VK_FROM_HANDLE(radv_pipeline, pipeline, pGeneratedCommandsInfo->pipeline);
struct radv_device *device = radv_cmd_buffer_device(cmd_buffer);
const struct radv_physical_device *pdev = radv_device_physical(device);
*upload_size = MAX2(*upload_size, 16);
if (!radv_cmd_buffer_upload_alloc(cmd_buffer, *upload_size, upload_offset, upload_data)) {
vk_command_buffer_set_error(&cmd_buffer->vk, VK_ERROR_OUT_OF_HOST_MEMORY);
return;
}
params->dispatch_params_offset = layout->dispatch_params_offset;
params->dispatch_initiator = device->dispatch_initiator | S_00B800_FORCE_START_AT_000(1);
params->is_dispatch = 1;
if (cond_render_enabled) {
params->predicating = true;
params->predication_va = cmd_buffer->state.predication_va;
params->predication_type = cmd_buffer->state.predication_type;
}
if (pipeline) {
struct radv_compute_pipeline *compute_pipeline = radv_pipeline_to_compute(pipeline);
struct radv_shader *cs = radv_get_shader(compute_pipeline->base.shaders, MESA_SHADER_COMPUTE);
if (cs->info.wave_size == 32) {
assert(pdev->info.gfx_level >= GFX10);
params->dispatch_initiator |= S_00B800_CS_W32_EN(1);
}
const struct radv_userdata_info *loc = radv_get_user_sgpr(cs, AC_UD_CS_GRID_SIZE);
if (loc->sgpr_idx != -1) {
params->grid_base_sgpr = (cs->info.user_data_0 + 4 * loc->sgpr_idx - SI_SH_REG_OFFSET) >> 2;
}
} else {
params->bind_pipeline = 1;
params->pipeline_params_offset = layout->pipeline_params_offset;
}
}
void
radv_prepare_dgc(struct radv_cmd_buffer *cmd_buffer, const VkGeneratedCommandsInfoNV *pGeneratedCommandsInfo,
bool cond_render_enabled)
{
VK_FROM_HANDLE(radv_indirect_command_layout, layout, pGeneratedCommandsInfo->indirectCommandsLayout);
VK_FROM_HANDLE(radv_pipeline, pipeline, pGeneratedCommandsInfo->pipeline);
VK_FROM_HANDLE(radv_buffer, prep_buffer, pGeneratedCommandsInfo->preprocessBuffer);
VK_FROM_HANDLE(radv_buffer, stream_buffer, pGeneratedCommandsInfo->pStreams[0].buffer);
struct radv_device *device = radv_cmd_buffer_device(cmd_buffer);
struct radv_meta_saved_state saved_state;
unsigned upload_offset, upload_size;
struct radv_buffer token_buffer;
void *upload_data;
uint32_t cmd_stride, upload_stride;
radv_get_sequence_size(layout, pipeline, &cmd_stride, &upload_stride);
unsigned cmd_buf_size =
radv_align_cmdbuf_size(device, cmd_stride * pGeneratedCommandsInfo->sequencesCount, AMD_IP_GFX);
uint64_t upload_addr =
radv_buffer_get_va(prep_buffer->bo) + prep_buffer->offset + pGeneratedCommandsInfo->preprocessOffset;
uint64_t stream_addr =
radv_buffer_get_va(stream_buffer->bo) + stream_buffer->offset + pGeneratedCommandsInfo->pStreams[0].offset;
struct radv_dgc_params params = {
.cmd_buf_stride = cmd_stride,
.cmd_buf_size = cmd_buf_size,
.upload_addr = (uint32_t)upload_addr,
.upload_stride = upload_stride,
.sequence_count = pGeneratedCommandsInfo->sequencesCount,
.stream_stride = layout->input_stride,
.use_preamble = radv_dgc_use_preamble(pGeneratedCommandsInfo),
.stream_addr = stream_addr,
};
upload_size = layout->push_constant_size + 16 * layout->dynamic_offset_count +
sizeof(layout->push_constant_offsets) + ARRAY_SIZE(pipeline->shaders) * 12;
if (!layout->push_constant_mask)
upload_size = 0;
if (layout->pipeline_bind_point == VK_PIPELINE_BIND_POINT_GRAPHICS) {
radv_prepare_dgc_graphics(cmd_buffer, pGeneratedCommandsInfo, &upload_size, &upload_offset, &upload_data,
&params);
} else {
assert(layout->pipeline_bind_point == VK_PIPELINE_BIND_POINT_COMPUTE);
radv_prepare_dgc_compute(cmd_buffer, pGeneratedCommandsInfo, &upload_size, &upload_offset, &upload_data, &params,
cond_render_enabled);
}
if (layout->push_constant_mask) {
uint32_t *desc = upload_data;
upload_data = (char *)upload_data + ARRAY_SIZE(pipeline->shaders) * 12;
unsigned idx = 0;
if (pipeline) {
for (unsigned i = 0; i < ARRAY_SIZE(pipeline->shaders); ++i) {
if (!pipeline->shaders[i])
continue;
const struct radv_shader *shader = pipeline->shaders[i];
const struct radv_userdata_locations *locs = &shader->info.user_sgprs_locs;
if (locs->shader_data[AC_UD_PUSH_CONSTANTS].sgpr_idx >= 0)
params.const_copy = 1;
if (locs->shader_data[AC_UD_PUSH_CONSTANTS].sgpr_idx >= 0 ||
locs->shader_data[AC_UD_INLINE_PUSH_CONSTANTS].sgpr_idx >= 0) {
unsigned upload_sgpr = 0;
unsigned inline_sgpr = 0;
if (locs->shader_data[AC_UD_PUSH_CONSTANTS].sgpr_idx >= 0) {
upload_sgpr = (shader->info.user_data_0 + 4 * locs->shader_data[AC_UD_PUSH_CONSTANTS].sgpr_idx -
SI_SH_REG_OFFSET) >>
2;
}
if (locs->shader_data[AC_UD_INLINE_PUSH_CONSTANTS].sgpr_idx >= 0) {
inline_sgpr = (shader->info.user_data_0 +
4 * locs->shader_data[AC_UD_INLINE_PUSH_CONSTANTS].sgpr_idx - SI_SH_REG_OFFSET) >>
2;
desc[idx * 3 + 1] = pipeline->shaders[i]->info.inline_push_constant_mask;
desc[idx * 3 + 2] = pipeline->shaders[i]->info.inline_push_constant_mask >> 32;
}
desc[idx * 3] = upload_sgpr | (inline_sgpr << 16);
++idx;
}
}
}
params.push_constant_shader_cnt = idx;
params.const_copy_size = layout->push_constant_size + 16 * layout->dynamic_offset_count;
params.push_constant_mask = layout->push_constant_mask;
memcpy(upload_data, layout->push_constant_offsets, sizeof(layout->push_constant_offsets));
upload_data = (char *)upload_data + sizeof(layout->push_constant_offsets);
memcpy(upload_data, cmd_buffer->push_constants, layout->push_constant_size);
upload_data = (char *)upload_data + layout->push_constant_size;
struct radv_descriptor_state *descriptors_state =
radv_get_descriptors_state(cmd_buffer, pGeneratedCommandsInfo->pipelineBindPoint);
memcpy(upload_data, descriptors_state->dynamic_buffers, 16 * layout->dynamic_offset_count);
upload_data = (char *)upload_data + 16 * layout->dynamic_offset_count;
}
radv_buffer_init(&token_buffer, device, cmd_buffer->upload.upload_bo, upload_size, upload_offset);
VkWriteDescriptorSet ds_writes[5];
VkDescriptorBufferInfo buf_info[ARRAY_SIZE(ds_writes)];
int ds_cnt = 0;
buf_info[ds_cnt] =
(VkDescriptorBufferInfo){.buffer = radv_buffer_to_handle(&token_buffer), .offset = 0, .range = upload_size};
ds_writes[ds_cnt] = (VkWriteDescriptorSet){.sType = VK_STRUCTURE_TYPE_WRITE_DESCRIPTOR_SET,
.dstBinding = DGC_DESC_PARAMS,
.dstArrayElement = 0,
.descriptorCount = 1,
.descriptorType = VK_DESCRIPTOR_TYPE_STORAGE_BUFFER,
.pBufferInfo = &buf_info[ds_cnt]};
++ds_cnt;
buf_info[ds_cnt] = (VkDescriptorBufferInfo){.buffer = pGeneratedCommandsInfo->preprocessBuffer,
.offset = pGeneratedCommandsInfo->preprocessOffset,
.range = pGeneratedCommandsInfo->preprocessSize};
ds_writes[ds_cnt] = (VkWriteDescriptorSet){.sType = VK_STRUCTURE_TYPE_WRITE_DESCRIPTOR_SET,
.dstBinding = DGC_DESC_PREPARE,
.dstArrayElement = 0,
.descriptorCount = 1,
.descriptorType = VK_DESCRIPTOR_TYPE_STORAGE_BUFFER,
.pBufferInfo = &buf_info[ds_cnt]};
++ds_cnt;
if (pGeneratedCommandsInfo->streamCount > 0) {
buf_info[ds_cnt] = (VkDescriptorBufferInfo){.buffer = pGeneratedCommandsInfo->pStreams[0].buffer,
.offset = pGeneratedCommandsInfo->pStreams[0].offset,
.range = VK_WHOLE_SIZE};
ds_writes[ds_cnt] = (VkWriteDescriptorSet){.sType = VK_STRUCTURE_TYPE_WRITE_DESCRIPTOR_SET,
.dstBinding = DGC_DESC_STREAM,
.dstArrayElement = 0,
.descriptorCount = 1,
.descriptorType = VK_DESCRIPTOR_TYPE_STORAGE_BUFFER,
.pBufferInfo = &buf_info[ds_cnt]};
++ds_cnt;
}
if (pGeneratedCommandsInfo->sequencesCountBuffer != VK_NULL_HANDLE) {
buf_info[ds_cnt] = (VkDescriptorBufferInfo){.buffer = pGeneratedCommandsInfo->sequencesCountBuffer,
.offset = pGeneratedCommandsInfo->sequencesCountOffset,
.range = VK_WHOLE_SIZE};
ds_writes[ds_cnt] = (VkWriteDescriptorSet){.sType = VK_STRUCTURE_TYPE_WRITE_DESCRIPTOR_SET,
.dstBinding = DGC_DESC_COUNT,
.dstArrayElement = 0,
.descriptorCount = 1,
.descriptorType = VK_DESCRIPTOR_TYPE_STORAGE_BUFFER,
.pBufferInfo = &buf_info[ds_cnt]};
++ds_cnt;
params.sequence_count |= 1u << 31;
}
radv_meta_save(&saved_state, cmd_buffer,
RADV_META_SAVE_COMPUTE_PIPELINE | RADV_META_SAVE_DESCRIPTORS | RADV_META_SAVE_CONSTANTS);
radv_CmdBindPipeline(radv_cmd_buffer_to_handle(cmd_buffer), VK_PIPELINE_BIND_POINT_COMPUTE,
device->meta_state.dgc_prepare.pipeline);
vk_common_CmdPushConstants(radv_cmd_buffer_to_handle(cmd_buffer), device->meta_state.dgc_prepare.p_layout,
VK_SHADER_STAGE_COMPUTE_BIT, 0, sizeof(params), &params);
radv_meta_push_descriptor_set(cmd_buffer, VK_PIPELINE_BIND_POINT_COMPUTE, device->meta_state.dgc_prepare.p_layout, 0,
ds_cnt, ds_writes);
unsigned block_count = MAX2(1, DIV_ROUND_UP(pGeneratedCommandsInfo->sequencesCount, 64));
vk_common_CmdDispatch(radv_cmd_buffer_to_handle(cmd_buffer), block_count, 1, 1);
radv_buffer_finish(&token_buffer);
radv_meta_restore(&saved_state, cmd_buffer);
}
/* VK_NV_device_generated_commands_compute */
VKAPI_ATTR void VKAPI_CALL
radv_GetPipelineIndirectMemoryRequirementsNV(VkDevice _device, const VkComputePipelineCreateInfo *pCreateInfo,
VkMemoryRequirements2 *pMemoryRequirements)
{
VkMemoryRequirements *reqs = &pMemoryRequirements->memoryRequirements;
const uint32_t size = sizeof(struct radv_compute_pipeline_metadata);
VK_FROM_HANDLE(radv_device, device, _device);
const struct radv_physical_device *pdev = radv_device_physical(device);
reqs->memoryTypeBits = ((1u << pdev->memory_properties.memoryTypeCount) - 1u) & ~pdev->memory_types_32bit;
reqs->alignment = 4;
reqs->size = align(size, reqs->alignment);
}
VKAPI_ATTR VkDeviceAddress VKAPI_CALL
radv_GetPipelineIndirectDeviceAddressNV(VkDevice device, const VkPipelineIndirectDeviceAddressInfoNV *pInfo)
{
VK_FROM_HANDLE(radv_pipeline, pipeline, pInfo->pipeline);
return radv_pipeline_to_compute(pipeline)->indirect.va;
}