Mirror of https://gitlab.freedesktop.org/mesa/mesa.git
radeonsi/gfx11: use SET_SH_REG_PAIRS_PACKED for compute by buffering reg writes
This is the compute portion of the work. It uses a separate buffer for
compute SH registers in si_context.

Reviewed-by: Pierre-Eric Pelloux-Prayer <pierre-eric.pelloux-prayer@amd.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/23517>
This commit is contained in:
parent 1753b321f8
commit 69bc1180b7

7 changed files with 112 additions and 39 deletions
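
The idea behind the buffering, for readers who have not seen the gfx side of the series: rather than emitting one SET_SH_REG packet per register write, register writes are accumulated as (offset, value) pairs in a fixed-size array in si_context and flushed as a single SET_SH_REG_PAIRS_PACKED packet just before the dispatch (see gfx11_emit_buffered_compute_sh_regs in the last hunk). Below is a minimal standalone sketch of that scheme. The pair layout mirrors si_sh_reg_pair from the diff, but push_sh_reg, flush_sh_regs, and the exact command-stream encoding (header dword, even-count padding) are illustrative assumptions, not the driver's actual API.

    #include <assert.h>
    #include <stdint.h>

    /* Two registers per entry: the packed packet consumes one dword of two
     * 16-bit offsets followed by two value dwords per pair. */
    struct sh_reg_pair {
       uint16_t reg_offset[2]; /* dword offsets relative to the SH register base */
       uint32_t reg_value[2];
    };

    struct sh_reg_buffer {
       unsigned num_regs;
       struct sh_reg_pair pairs[32];
    };

    /* Buffer one register write instead of emitting a packet immediately. */
    static void push_sh_reg(struct sh_reg_buffer *buf, unsigned reg_dw_offset,
                            uint32_t value)
    {
       unsigned i = buf->num_regs++;
       assert(i / 2 < 32);
       buf->pairs[i / 2].reg_offset[i % 2] = reg_dw_offset;
       buf->pairs[i / 2].reg_value[i % 2] = value;
    }

    /* Flush all buffered writes as one packed packet. The encoding here is
     * deliberately simplified: a real implementation would emit a
     * PKT3_SET_SH_REG_PAIRS_PACKED header in front of the payload. */
    static unsigned flush_sh_regs(struct sh_reg_buffer *buf, uint32_t *cs)
    {
       unsigned num = buf->num_regs, dw = 0;

       if (!num)
          return 0;
       if (num % 2) { /* pad to an even count by repeating the last write,
                       * which is idempotent (same register, same value) */
          push_sh_reg(buf, buf->pairs[(num - 1) / 2].reg_offset[(num - 1) % 2],
                      buf->pairs[(num - 1) / 2].reg_value[(num - 1) % 2]);
          num = buf->num_regs;
       }
       cs[dw++] = num; /* register count; the PKT3 header dword is omitted */
       for (unsigned p = 0; p < num / 2; p++) {
          cs[dw++] = (uint32_t)buf->pairs[p].reg_offset[0] |
                     ((uint32_t)buf->pairs[p].reg_offset[1] << 16);
          cs[dw++] = buf->pairs[p].reg_value[0];
          cs[dw++] = buf->pairs[p].reg_value[1];
       }
       buf->num_regs = 0; /* matches the assertions in si_begin_new_gfx_cs */
       return dw;
    }

Duplicating the last write to even out the count is one plausible padding strategy; the packet operates on pairs by construction, which is why si_sh_reg_pair stores two registers per entry.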

@@ -125,6 +125,13 @@
    sctx->buffered_gfx_sh_regs[__i / 2].reg_value[__i % 2] = value; \
 } while (0)
 
+#define radeon_push_compute_sh_reg(reg, value) do { \
+   unsigned __i = sctx->num_buffered_compute_sh_regs++; \
+   assert(__i / 2 < ARRAY_SIZE(sctx->buffered_compute_sh_regs)); \
+   sctx->buffered_compute_sh_regs[__i / 2].reg_offset[__i % 2] = ((reg) - SI_SH_REG_OFFSET) >> 2; \
+   sctx->buffered_compute_sh_regs[__i / 2].reg_value[__i % 2] = value; \
+} while (0)
+
 #define radeon_set_or_push_gfx_sh_reg(reg, value) do { \
    if (GFX_VERSION >= GFX11) { \
       radeon_push_gfx_sh_reg(reg, value); \

@@ -144,6 +151,16 @@
    } \
 } while (0)
 
+#define radeon_opt_push_compute_sh_reg(offset, reg, val) do { \
+   unsigned __value = val; \
+   if (((sctx->tracked_regs.other_reg_saved_mask >> (reg)) & 0x1) != 0x1 || \
+       sctx->tracked_regs.other_reg_value[reg] != __value) { \
+      radeon_push_compute_sh_reg(offset, __value); \
+      sctx->tracked_regs.other_reg_saved_mask |= BITFIELD64_BIT(reg); \
+      sctx->tracked_regs.other_reg_value[reg] = __value; \
+   } \
+} while (0)
+
 #define radeon_set_uconfig_reg_seq(reg, num, perfctr) do { \
    assert((reg) >= CIK_UCONFIG_REG_OFFSET && (reg) < CIK_UCONFIG_REG_END); \
    radeon_emit(PKT3(PKT3_SET_UCONFIG_REG, num, perfctr)); \

@@ -499,24 +499,24 @@ static bool si_switch_compute_shader(struct si_context *sctx, struct si_compute
                              RADEON_USAGE_READ | RADEON_PRIO_SHADER_BINARY);
 
    if (sctx->gfx_level >= GFX11) {
-      radeon_begin(cs);
-      radeon_set_sh_reg(R_00B830_COMPUTE_PGM_LO, shader_va >> 8);
-      radeon_opt_set_sh_reg2(sctx, R_00B848_COMPUTE_PGM_RSRC1,
-                             SI_TRACKED_COMPUTE_PGM_RSRC1,
-                             config->rsrc1, rsrc2);
-      radeon_opt_set_sh_reg(sctx, R_00B8A0_COMPUTE_PGM_RSRC3,
-                            SI_TRACKED_COMPUTE_PGM_RSRC3,
-                            S_00B8A0_INST_PREF_SIZE(si_get_shader_prefetch_size(shader)));
-      radeon_opt_set_sh_reg(sctx, R_00B860_COMPUTE_TMPRING_SIZE,
-                            SI_TRACKED_COMPUTE_TMPRING_SIZE, tmpring_size);
+      radeon_push_compute_sh_reg(R_00B830_COMPUTE_PGM_LO, shader_va >> 8);
+      radeon_opt_push_compute_sh_reg(R_00B848_COMPUTE_PGM_RSRC1,
+                                     SI_TRACKED_COMPUTE_PGM_RSRC1, config->rsrc1);
+      radeon_opt_push_compute_sh_reg(R_00B84C_COMPUTE_PGM_RSRC2,
+                                     SI_TRACKED_COMPUTE_PGM_RSRC2, rsrc2);
+      radeon_opt_push_compute_sh_reg(R_00B8A0_COMPUTE_PGM_RSRC3,
+                                     SI_TRACKED_COMPUTE_PGM_RSRC3,
+                                     S_00B8A0_INST_PREF_SIZE(si_get_shader_prefetch_size(shader)));
+      radeon_opt_push_compute_sh_reg(R_00B860_COMPUTE_TMPRING_SIZE,
+                                     SI_TRACKED_COMPUTE_TMPRING_SIZE, tmpring_size);
       if (shader->scratch_bo) {
-         radeon_opt_set_sh_reg2(sctx, R_00B840_COMPUTE_DISPATCH_SCRATCH_BASE_LO,
-                                SI_TRACKED_COMPUTE_DISPATCH_SCRATCH_BASE_LO,
-                                sctx->compute_scratch_buffer->gpu_address >> 8,
-                                sctx->compute_scratch_buffer->gpu_address >> 40);
+         radeon_opt_push_compute_sh_reg(R_00B840_COMPUTE_DISPATCH_SCRATCH_BASE_LO,
+                                        SI_TRACKED_COMPUTE_DISPATCH_SCRATCH_BASE_LO,
+                                        sctx->compute_scratch_buffer->gpu_address >> 8);
+         radeon_opt_push_compute_sh_reg(R_00B844_COMPUTE_DISPATCH_SCRATCH_BASE_HI,
+                                        SI_TRACKED_COMPUTE_DISPATCH_SCRATCH_BASE_HI,
+                                        sctx->compute_scratch_buffer->gpu_address >> 40);
       }
-      radeon_end();
    } else {
       radeon_begin(cs);
       radeon_set_sh_reg(R_00B830_COMPUTE_PGM_LO, shader_va >> 8);

@@ -730,24 +730,39 @@ static void si_setup_nir_user_data(struct si_context *sctx, const struct pipe_gr
          }
          radeon_begin_again(cs);
       } else {
-         radeon_set_sh_reg_seq(grid_size_reg, 3);
-         radeon_emit(info->grid[0]);
-         radeon_emit(info->grid[1]);
-         radeon_emit(info->grid[2]);
+         if (sctx->gfx_level >= GFX11) {
+            radeon_push_compute_sh_reg(grid_size_reg, info->grid[0]);
+            radeon_push_compute_sh_reg(grid_size_reg + 4, info->grid[1]);
+            radeon_push_compute_sh_reg(grid_size_reg + 8, info->grid[2]);
+         } else {
+            radeon_set_sh_reg_seq(grid_size_reg, 3);
+            radeon_emit(info->grid[0]);
+            radeon_emit(info->grid[1]);
+            radeon_emit(info->grid[2]);
+         }
       }
    }
 
    if (sel->info.uses_variable_block_size) {
       uint32_t value = info->block[0] | (info->block[1] << 10) | (info->block[2] << 20);
 
-      radeon_set_sh_reg(block_size_reg, value);
+      if (sctx->gfx_level >= GFX11) {
+         radeon_push_compute_sh_reg(block_size_reg, value);
+      } else {
+         radeon_set_sh_reg(block_size_reg, value);
+      }
    }
 
    if (sel->info.base.cs.user_data_components_amd) {
       unsigned num = sel->info.base.cs.user_data_components_amd;
 
-      radeon_set_sh_reg_seq(cs_user_data_reg, num);
-      radeon_emit_array(sctx->cs_user_data, num);
+      if (sctx->gfx_level >= GFX11) {
+         for (unsigned i = 0; i < num; i++)
+            radeon_push_compute_sh_reg(cs_user_data_reg + i * 4, sctx->cs_user_data[i]);
+      } else {
+         radeon_set_sh_reg_seq(cs_user_data_reg, num);
+         radeon_emit_array(sctx->cs_user_data, num);
+      }
    }
    radeon_end();
 }

@@ -777,9 +792,15 @@ static void si_emit_dispatch_packets(struct si_context *sctx, const struct pipe_
                                                sctx->cs_max_waves_per_sh,
                                                threadgroups_per_cu);
 
-   radeon_opt_set_sh_reg(sctx, R_00B854_COMPUTE_RESOURCE_LIMITS,
-                         SI_TRACKED_COMPUTE_RESOURCE_LIMITS,
-                         compute_resource_limits);
+   if (sctx->gfx_level >= GFX11) {
+      radeon_opt_push_compute_sh_reg(R_00B854_COMPUTE_RESOURCE_LIMITS,
+                                     SI_TRACKED_COMPUTE_RESOURCE_LIMITS,
+                                     compute_resource_limits);
+   } else {
+      radeon_opt_set_sh_reg(sctx, R_00B854_COMPUTE_RESOURCE_LIMITS,
+                            SI_TRACKED_COMPUTE_RESOURCE_LIMITS,
+                            compute_resource_limits);
+   }
 
    unsigned dispatch_initiator = S_00B800_COMPUTE_SHADER_EN(1) | S_00B800_FORCE_START_AT_000(1) |
                                  /* If the KMD allows it (there is a KMD hw register for it),

@@ -816,9 +837,24 @@ static void si_emit_dispatch_packets(struct si_context *sctx, const struct pipe_
       num_threads[2] = S_00B824_NUM_THREAD_FULL(info->block[2]);
    }
 
-   radeon_opt_set_sh_reg3(sctx, R_00B81C_COMPUTE_NUM_THREAD_X,
-                          SI_TRACKED_COMPUTE_NUM_THREAD_X,
-                          num_threads[0], num_threads[1], num_threads[2]);
+   if (sctx->gfx_level >= GFX11) {
+      radeon_opt_push_compute_sh_reg(R_00B81C_COMPUTE_NUM_THREAD_X,
+                                     SI_TRACKED_COMPUTE_NUM_THREAD_X, num_threads[0]);
+      radeon_opt_push_compute_sh_reg(R_00B820_COMPUTE_NUM_THREAD_Y,
+                                     SI_TRACKED_COMPUTE_NUM_THREAD_Y, num_threads[1]);
+      radeon_opt_push_compute_sh_reg(R_00B824_COMPUTE_NUM_THREAD_Z,
+                                     SI_TRACKED_COMPUTE_NUM_THREAD_Z, num_threads[2]);
+   } else {
+      radeon_opt_set_sh_reg3(sctx, R_00B81C_COMPUTE_NUM_THREAD_X,
+                             SI_TRACKED_COMPUTE_NUM_THREAD_X,
+                             num_threads[0], num_threads[1], num_threads[2]);
+   }
+
+   if (sctx->gfx_level >= GFX11) {
+      radeon_end();
+      gfx11_emit_buffered_compute_sh_regs(sctx);
+      radeon_begin_again(cs);
+   }
 
    if (info->indirect) {
       uint64_t base_va = si_resource(info->indirect)->gpu_address;

@@ -2155,17 +2155,17 @@ void si_shader_change_notify(struct si_context *sctx)
    }
 }
 
-#define si_emit_consecutive_shader_pointers(sctx, pointer_mask, sh_base) do { \
+#define si_emit_consecutive_shader_pointers(sctx, pointer_mask, sh_base, type) do { \
    unsigned sh_reg_base = (sh_base); \
    if (sh_reg_base) { \
       unsigned mask = sctx->shader_pointers_dirty & (pointer_mask); \
       \
-      if (sctx->gfx_level >= GFX11 && sh_reg_base != R_00B900_COMPUTE_USER_DATA_0) { \
+      if (sctx->gfx_level >= GFX11) { \
         u_foreach_bit(i, mask) { \
            struct si_descriptors *descs = &sctx->descriptors[i]; \
            unsigned sh_reg = sh_reg_base + descs->shader_userdata_offset; \
            \
-            radeon_push_gfx_sh_reg(sh_reg, descs->gpu_address); \
+            radeon_push_##type##_sh_reg(sh_reg, descs->gpu_address); \
         } \
      } else { \
         while (mask) { \

@@ -2231,15 +2231,15 @@ void si_emit_graphics_shader_pointers(struct si_context *sctx)
 
    radeon_begin(&sctx->gfx_cs);
    si_emit_consecutive_shader_pointers(sctx, SI_DESCS_SHADER_MASK(VERTEX),
-                                       sh_base[PIPE_SHADER_VERTEX]);
+                                       sh_base[PIPE_SHADER_VERTEX], gfx);
    si_emit_consecutive_shader_pointers(sctx, SI_DESCS_SHADER_MASK(TESS_EVAL),
-                                       sh_base[PIPE_SHADER_TESS_EVAL]);
+                                       sh_base[PIPE_SHADER_TESS_EVAL], gfx);
    si_emit_consecutive_shader_pointers(sctx, SI_DESCS_SHADER_MASK(FRAGMENT),
-                                       sh_base[PIPE_SHADER_FRAGMENT]);
+                                       sh_base[PIPE_SHADER_FRAGMENT], gfx);
    si_emit_consecutive_shader_pointers(sctx, SI_DESCS_SHADER_MASK(TESS_CTRL),
-                                       sh_base[PIPE_SHADER_TESS_CTRL]);
+                                       sh_base[PIPE_SHADER_TESS_CTRL], gfx);
    si_emit_consecutive_shader_pointers(sctx, SI_DESCS_SHADER_MASK(GEOMETRY),
-                                       sh_base[PIPE_SHADER_GEOMETRY]);
+                                       sh_base[PIPE_SHADER_GEOMETRY], gfx);
 
    if (sctx->gs_attribute_ring_pointer_dirty) {
       assert(sctx->gfx_level >= GFX11);

@@ -2266,11 +2266,16 @@ void si_emit_compute_shader_pointers(struct si_context *sctx)
 
    radeon_begin(cs);
    si_emit_consecutive_shader_pointers(sctx, SI_DESCS_SHADER_MASK(COMPUTE),
-                                       R_00B900_COMPUTE_USER_DATA_0);
+                                       R_00B900_COMPUTE_USER_DATA_0, compute);
    sctx->shader_pointers_dirty &= ~SI_DESCS_SHADER_MASK(COMPUTE);
 
    if (sctx->compute_bindless_pointer_dirty) {
-      radeon_emit_one_32bit_pointer(sctx, &sctx->bindless_descriptors, base);
+      if (sctx->gfx_level >= GFX11) {
+         radeon_push_compute_sh_reg(base + sctx->bindless_descriptors.shader_userdata_offset,
+                                    sctx->bindless_descriptors.gpu_address);
+      } else {
+         radeon_emit_one_32bit_pointer(sctx, &sctx->bindless_descriptors, base);
+      }
       sctx->compute_bindless_pointer_dirty = false;
    }

@@ -546,7 +546,9 @@ void si_begin_new_gfx_cs(struct si_context *ctx, bool first_cs)
    ctx->last_num_tcs_input_cp = -1;
 
    assert(ctx->num_buffered_gfx_sh_regs == 0);
+   assert(ctx->num_buffered_compute_sh_regs == 0);
    ctx->num_buffered_gfx_sh_regs = 0;
+   ctx->num_buffered_compute_sh_regs = 0;
 
    if (ctx->scratch_buffer) {
       si_context_add_resource_size(ctx, &ctx->scratch_buffer->b.b);

@@ -1034,6 +1034,8 @@ struct si_context {
    /* Gfx11+: Buffered SH registers for SET_SH_REG_PAIRS_PACKED*. */
    unsigned num_buffered_gfx_sh_regs;
    struct si_sh_reg_pair buffered_gfx_sh_regs[32];
+   unsigned num_buffered_compute_sh_regs;
+   struct si_sh_reg_pair buffered_compute_sh_regs[32];
 
    /* Atom declarations. */
    struct si_framebuffer framebuffer;

@@ -624,6 +624,7 @@ void si_cp_dma_prefetch(struct si_context *sctx, struct pipe_resource *buf,
 void si_set_vertex_buffer_descriptor(struct si_screen *sscreen, struct si_vertex_elements *velems,
                                      struct pipe_vertex_buffer *vb, unsigned element_index,
                                      uint32_t *out);
+void gfx11_emit_buffered_compute_sh_regs(struct si_context *sctx);
 void si_init_draw_functions_GFX6(struct si_context *sctx);
 void si_init_draw_functions_GFX7(struct si_context *sctx);
 void si_init_draw_functions_GFX8(struct si_context *sctx);

@@ -1517,6 +1517,16 @@ gfx11_emit_buffered_sh_regs_inline(struct si_context *sctx, unsigned *num_regs,
    radeon_end();
 }
 
+#if GFX_VER == 6 /* declare this function only once because there is only one variant. */
+
+void gfx11_emit_buffered_compute_sh_regs(struct si_context *sctx)
+{
+   gfx11_emit_buffered_sh_regs_inline(sctx, &sctx->num_buffered_compute_sh_regs,
+                                      sctx->buffered_compute_sh_regs);
+}
+
+#endif
+
 #define EMIT_SQTT_END_DRAW \
    do { \
       if (GFX_VERSION >= GFX9 && unlikely(sctx->sqtt_enabled)) { \