radeonsi/gfx11: use SET_SH_REG_PAIRS_PACKED for compute by buffering reg writes

This is the compute portion of the work. It uses a separate buffer
for compute SH registers in si_context.

Reviewed-by: Pierre-Eric Pelloux-Prayer <pierre-eric.pelloux-prayer@amd.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/23517>
Authored by Marek Olšák on 2023-06-11 18:37:26 -04:00; committed by Marge Bot.
parent 1753b321f8
commit 69bc1180b7
7 changed files with 112 additions and 39 deletions

View file

@@ -125,6 +125,13 @@
sctx->buffered_gfx_sh_regs[__i / 2].reg_value[__i % 2] = value; \
} while (0)
/* Gfx11+: queue one compute SH register write into the per-context buffer
 * (sctx->buffered_compute_sh_regs) instead of emitting a SET_SH_REG packet
 * immediately. Buffered entries are later flushed in one batch by
 * gfx11_emit_buffered_compute_sh_regs() (SET_SH_REG_PAIRS_PACKED path).
 * Offsets are stored as dword offsets relative to SI_SH_REG_OFFSET, packed
 * two registers per si_sh_reg_pair element (__i / 2 selects the pair,
 * __i % 2 selects the slot within it).
 */
#define radeon_push_compute_sh_reg(reg, value) do { \
unsigned __i = sctx->num_buffered_compute_sh_regs++; \
assert(__i / 2 < ARRAY_SIZE(sctx->buffered_compute_sh_regs)); \
sctx->buffered_compute_sh_regs[__i / 2].reg_offset[__i % 2] = ((reg) - SI_SH_REG_OFFSET) >> 2; \
sctx->buffered_compute_sh_regs[__i / 2].reg_value[__i % 2] = value; \
} while (0)
#define radeon_set_or_push_gfx_sh_reg(reg, value) do { \
if (GFX_VERSION >= GFX11) { \
radeon_push_gfx_sh_reg(reg, value); \
@@ -144,6 +151,16 @@
} \
} while (0)
/* Redundancy-eliminating variant of radeon_push_compute_sh_reg:
 * only buffer the write when the register is not yet shadowed, or its
 * shadowed value differs. "offset" is the hardware register offset;
 * "reg" is the SI_TRACKED_* index into tracked_regs.other_reg_value /
 * other_reg_saved_mask. On a real write, the shadow value and its
 * saved-mask bit are updated so later identical writes are skipped.
 */
#define radeon_opt_push_compute_sh_reg(offset, reg, val) do { \
unsigned __value = val; \
if (((sctx->tracked_regs.other_reg_saved_mask >> (reg)) & 0x1) != 0x1 || \
sctx->tracked_regs.other_reg_value[reg] != __value) { \
radeon_push_compute_sh_reg(offset, __value); \
sctx->tracked_regs.other_reg_saved_mask |= BITFIELD64_BIT(reg); \
sctx->tracked_regs.other_reg_value[reg] = __value; \
} \
} while (0)
#define radeon_set_uconfig_reg_seq(reg, num, perfctr) do { \
assert((reg) >= CIK_UCONFIG_REG_OFFSET && (reg) < CIK_UCONFIG_REG_END); \
radeon_emit(PKT3(PKT3_SET_UCONFIG_REG, num, perfctr)); \

View file

@@ -499,24 +499,24 @@ static bool si_switch_compute_shader(struct si_context *sctx, struct si_compute
RADEON_USAGE_READ | RADEON_PRIO_SHADER_BINARY);
if (sctx->gfx_level >= GFX11) {
radeon_begin(cs);
radeon_set_sh_reg(R_00B830_COMPUTE_PGM_LO, shader_va >> 8);
radeon_opt_set_sh_reg2(sctx, R_00B848_COMPUTE_PGM_RSRC1,
SI_TRACKED_COMPUTE_PGM_RSRC1,
config->rsrc1, rsrc2);
radeon_opt_set_sh_reg(sctx, R_00B8A0_COMPUTE_PGM_RSRC3,
SI_TRACKED_COMPUTE_PGM_RSRC3,
S_00B8A0_INST_PREF_SIZE(si_get_shader_prefetch_size(shader)));
radeon_opt_set_sh_reg(sctx, R_00B860_COMPUTE_TMPRING_SIZE,
SI_TRACKED_COMPUTE_TMPRING_SIZE, tmpring_size);
radeon_push_compute_sh_reg(R_00B830_COMPUTE_PGM_LO, shader_va >> 8);
radeon_opt_push_compute_sh_reg(R_00B848_COMPUTE_PGM_RSRC1,
SI_TRACKED_COMPUTE_PGM_RSRC1, config->rsrc1);
radeon_opt_push_compute_sh_reg(R_00B84C_COMPUTE_PGM_RSRC2,
SI_TRACKED_COMPUTE_PGM_RSRC2, rsrc2);
radeon_opt_push_compute_sh_reg(R_00B8A0_COMPUTE_PGM_RSRC3,
SI_TRACKED_COMPUTE_PGM_RSRC3,
S_00B8A0_INST_PREF_SIZE(si_get_shader_prefetch_size(shader)));
radeon_opt_push_compute_sh_reg(R_00B860_COMPUTE_TMPRING_SIZE,
SI_TRACKED_COMPUTE_TMPRING_SIZE, tmpring_size);
if (shader->scratch_bo) {
radeon_opt_set_sh_reg2(sctx, R_00B840_COMPUTE_DISPATCH_SCRATCH_BASE_LO,
SI_TRACKED_COMPUTE_DISPATCH_SCRATCH_BASE_LO,
sctx->compute_scratch_buffer->gpu_address >> 8,
sctx->compute_scratch_buffer->gpu_address >> 40);
radeon_opt_push_compute_sh_reg(R_00B840_COMPUTE_DISPATCH_SCRATCH_BASE_LO,
SI_TRACKED_COMPUTE_DISPATCH_SCRATCH_BASE_LO,
sctx->compute_scratch_buffer->gpu_address >> 8);
radeon_opt_push_compute_sh_reg(R_00B844_COMPUTE_DISPATCH_SCRATCH_BASE_HI,
SI_TRACKED_COMPUTE_DISPATCH_SCRATCH_BASE_HI,
sctx->compute_scratch_buffer->gpu_address >> 40);
}
radeon_end();
} else {
radeon_begin(cs);
radeon_set_sh_reg(R_00B830_COMPUTE_PGM_LO, shader_va >> 8);
@@ -730,24 +730,39 @@ static void si_setup_nir_user_data(struct si_context *sctx, const struct pipe_gr
}
radeon_begin_again(cs);
} else {
radeon_set_sh_reg_seq(grid_size_reg, 3);
radeon_emit(info->grid[0]);
radeon_emit(info->grid[1]);
radeon_emit(info->grid[2]);
if (sctx->gfx_level >= GFX11) {
radeon_push_compute_sh_reg(grid_size_reg, info->grid[0]);
radeon_push_compute_sh_reg(grid_size_reg + 4, info->grid[1]);
radeon_push_compute_sh_reg(grid_size_reg + 8, info->grid[2]);
} else {
radeon_set_sh_reg_seq(grid_size_reg, 3);
radeon_emit(info->grid[0]);
radeon_emit(info->grid[1]);
radeon_emit(info->grid[2]);
}
}
}
if (sel->info.uses_variable_block_size) {
uint32_t value = info->block[0] | (info->block[1] << 10) | (info->block[2] << 20);
radeon_set_sh_reg(block_size_reg, value);
if (sctx->gfx_level >= GFX11) {
radeon_push_compute_sh_reg(block_size_reg, value);
} else {
radeon_set_sh_reg(block_size_reg, value);
}
}
if (sel->info.base.cs.user_data_components_amd) {
unsigned num = sel->info.base.cs.user_data_components_amd;
radeon_set_sh_reg_seq(cs_user_data_reg, num);
radeon_emit_array(sctx->cs_user_data, num);
if (sctx->gfx_level >= GFX11) {
for (unsigned i = 0; i < num; i++)
radeon_push_compute_sh_reg(cs_user_data_reg + i * 4, sctx->cs_user_data[i]);
} else {
radeon_set_sh_reg_seq(cs_user_data_reg, num);
radeon_emit_array(sctx->cs_user_data, num);
}
}
radeon_end();
}
@@ -777,9 +792,15 @@ static void si_emit_dispatch_packets(struct si_context *sctx, const struct pipe_
sctx->cs_max_waves_per_sh,
threadgroups_per_cu);
radeon_opt_set_sh_reg(sctx, R_00B854_COMPUTE_RESOURCE_LIMITS,
SI_TRACKED_COMPUTE_RESOURCE_LIMITS,
compute_resource_limits);
if (sctx->gfx_level >= GFX11) {
radeon_opt_push_compute_sh_reg(R_00B854_COMPUTE_RESOURCE_LIMITS,
SI_TRACKED_COMPUTE_RESOURCE_LIMITS,
compute_resource_limits);
} else {
radeon_opt_set_sh_reg(sctx, R_00B854_COMPUTE_RESOURCE_LIMITS,
SI_TRACKED_COMPUTE_RESOURCE_LIMITS,
compute_resource_limits);
}
unsigned dispatch_initiator = S_00B800_COMPUTE_SHADER_EN(1) | S_00B800_FORCE_START_AT_000(1) |
/* If the KMD allows it (there is a KMD hw register for it),
@@ -816,9 +837,24 @@ static void si_emit_dispatch_packets(struct si_context *sctx, const struct pipe_
num_threads[2] = S_00B824_NUM_THREAD_FULL(info->block[2]);
}
radeon_opt_set_sh_reg3(sctx, R_00B81C_COMPUTE_NUM_THREAD_X,
SI_TRACKED_COMPUTE_NUM_THREAD_X,
num_threads[0], num_threads[1], num_threads[2]);
if (sctx->gfx_level >= GFX11) {
radeon_opt_push_compute_sh_reg(R_00B81C_COMPUTE_NUM_THREAD_X,
SI_TRACKED_COMPUTE_NUM_THREAD_X, num_threads[0]);
radeon_opt_push_compute_sh_reg(R_00B820_COMPUTE_NUM_THREAD_Y,
SI_TRACKED_COMPUTE_NUM_THREAD_Y, num_threads[1]);
radeon_opt_push_compute_sh_reg(R_00B824_COMPUTE_NUM_THREAD_Z,
SI_TRACKED_COMPUTE_NUM_THREAD_Z, num_threads[2]);
} else {
radeon_opt_set_sh_reg3(sctx, R_00B81C_COMPUTE_NUM_THREAD_X,
SI_TRACKED_COMPUTE_NUM_THREAD_X,
num_threads[0], num_threads[1], num_threads[2]);
}
if (sctx->gfx_level >= GFX11) {
radeon_end();
gfx11_emit_buffered_compute_sh_regs(sctx);
radeon_begin_again(cs);
}
if (info->indirect) {
uint64_t base_va = si_resource(info->indirect)->gpu_address;

View file

@@ -2155,17 +2155,17 @@ void si_shader_change_notify(struct si_context *sctx)
}
}
#define si_emit_consecutive_shader_pointers(sctx, pointer_mask, sh_base) do { \
#define si_emit_consecutive_shader_pointers(sctx, pointer_mask, sh_base, type) do { \
unsigned sh_reg_base = (sh_base); \
if (sh_reg_base) { \
unsigned mask = sctx->shader_pointers_dirty & (pointer_mask); \
\
if (sctx->gfx_level >= GFX11 && sh_reg_base != R_00B900_COMPUTE_USER_DATA_0) { \
if (sctx->gfx_level >= GFX11) { \
u_foreach_bit(i, mask) { \
struct si_descriptors *descs = &sctx->descriptors[i]; \
unsigned sh_reg = sh_reg_base + descs->shader_userdata_offset; \
\
radeon_push_gfx_sh_reg(sh_reg, descs->gpu_address); \
radeon_push_##type##_sh_reg(sh_reg, descs->gpu_address); \
} \
} else { \
while (mask) { \
@@ -2231,15 +2231,15 @@ void si_emit_graphics_shader_pointers(struct si_context *sctx)
radeon_begin(&sctx->gfx_cs);
si_emit_consecutive_shader_pointers(sctx, SI_DESCS_SHADER_MASK(VERTEX),
sh_base[PIPE_SHADER_VERTEX]);
sh_base[PIPE_SHADER_VERTEX], gfx);
si_emit_consecutive_shader_pointers(sctx, SI_DESCS_SHADER_MASK(TESS_EVAL),
sh_base[PIPE_SHADER_TESS_EVAL]);
sh_base[PIPE_SHADER_TESS_EVAL], gfx);
si_emit_consecutive_shader_pointers(sctx, SI_DESCS_SHADER_MASK(FRAGMENT),
sh_base[PIPE_SHADER_FRAGMENT]);
sh_base[PIPE_SHADER_FRAGMENT], gfx);
si_emit_consecutive_shader_pointers(sctx, SI_DESCS_SHADER_MASK(TESS_CTRL),
sh_base[PIPE_SHADER_TESS_CTRL]);
sh_base[PIPE_SHADER_TESS_CTRL], gfx);
si_emit_consecutive_shader_pointers(sctx, SI_DESCS_SHADER_MASK(GEOMETRY),
sh_base[PIPE_SHADER_GEOMETRY]);
sh_base[PIPE_SHADER_GEOMETRY], gfx);
if (sctx->gs_attribute_ring_pointer_dirty) {
assert(sctx->gfx_level >= GFX11);
@@ -2266,11 +2266,16 @@ void si_emit_compute_shader_pointers(struct si_context *sctx)
radeon_begin(cs);
si_emit_consecutive_shader_pointers(sctx, SI_DESCS_SHADER_MASK(COMPUTE),
R_00B900_COMPUTE_USER_DATA_0);
R_00B900_COMPUTE_USER_DATA_0, compute);
sctx->shader_pointers_dirty &= ~SI_DESCS_SHADER_MASK(COMPUTE);
if (sctx->compute_bindless_pointer_dirty) {
radeon_emit_one_32bit_pointer(sctx, &sctx->bindless_descriptors, base);
if (sctx->gfx_level >= GFX11) {
radeon_push_compute_sh_reg(base + sctx->bindless_descriptors.shader_userdata_offset,
sctx->bindless_descriptors.gpu_address);
} else {
radeon_emit_one_32bit_pointer(sctx, &sctx->bindless_descriptors, base);
}
sctx->compute_bindless_pointer_dirty = false;
}

View file

@@ -546,7 +546,9 @@ void si_begin_new_gfx_cs(struct si_context *ctx, bool first_cs)
ctx->last_num_tcs_input_cp = -1;
assert(ctx->num_buffered_gfx_sh_regs == 0);
assert(ctx->num_buffered_compute_sh_regs == 0);
ctx->num_buffered_gfx_sh_regs = 0;
ctx->num_buffered_compute_sh_regs = 0;
if (ctx->scratch_buffer) {
si_context_add_resource_size(ctx, &ctx->scratch_buffer->b.b);

View file

@@ -1034,6 +1034,8 @@ struct si_context {
/* Gfx11+: Buffered SH registers for SET_SH_REG_PAIRS_PACKED*. */
unsigned num_buffered_gfx_sh_regs;
struct si_sh_reg_pair buffered_gfx_sh_regs[32];
unsigned num_buffered_compute_sh_regs;
struct si_sh_reg_pair buffered_compute_sh_regs[32];
/* Atom declarations. */
struct si_framebuffer framebuffer;

View file

@@ -624,6 +624,7 @@ void si_cp_dma_prefetch(struct si_context *sctx, struct pipe_resource *buf,
void si_set_vertex_buffer_descriptor(struct si_screen *sscreen, struct si_vertex_elements *velems,
struct pipe_vertex_buffer *vb, unsigned element_index,
uint32_t *out);
void gfx11_emit_buffered_compute_sh_regs(struct si_context *sctx);
void si_init_draw_functions_GFX6(struct si_context *sctx);
void si_init_draw_functions_GFX7(struct si_context *sctx);
void si_init_draw_functions_GFX8(struct si_context *sctx);

View file

@@ -1517,6 +1517,16 @@ gfx11_emit_buffered_sh_regs_inline(struct si_context *sctx, unsigned *num_regs,
radeon_end();
}
#if GFX_VER == 6 /* declare this function only once because there is only one variant. */
/* Flush all compute SH register writes queued by radeon_push_compute_sh_reg
 * into the command stream (buffered for the SET_SH_REG_PAIRS_PACKED path —
 * see the buffered_compute_sh_regs declaration in si_context).
 * NOTE(review): the inline helper takes the count by pointer, so it
 * presumably resets num_buffered_compute_sh_regs to 0 after emitting —
 * confirm against gfx11_emit_buffered_sh_regs_inline's body.
 */
void gfx11_emit_buffered_compute_sh_regs(struct si_context *sctx)
{
gfx11_emit_buffered_sh_regs_inline(sctx, &sctx->num_buffered_compute_sh_regs,
sctx->buffered_compute_sh_regs);
}
#endif
#define EMIT_SQTT_END_DRAW \
do { \
if (GFX_VERSION >= GFX9 && unlikely(sctx->sqtt_enabled)) { \