radeonsi: "create_dma_compute" shader in nir

Reviewed-by: Marek Olšák <marek.olsak@amd.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/25782>
Ganesh Belgur Ramachandra, 2023-10-05 06:49:58 -05:00 (committed by Marge Bot)
parent d54c140eb6
commit 6584088cd5
5 changed files with 86 additions and 110 deletions


@@ -395,7 +395,7 @@ static void si_compute_do_clear_or_copy(struct si_context *sctx, struct pipe_res
    if (!sctx->cs_copy_buffer) {
       sctx->cs_copy_buffer = si_create_dma_compute_shader(
-         &sctx->b, SI_COMPUTE_COPY_DW_PER_THREAD, shader_dst_stream_policy, true);
+         sctx, SI_COMPUTE_COPY_DW_PER_THREAD, shader_dst_stream_policy, true);
    }
 
    si_launch_grid_internal_ssbos(sctx, &info, sctx->cs_copy_buffer, flags, coher,
@@ -409,7 +409,7 @@ static void si_compute_do_clear_or_copy(struct si_context *sctx, struct pipe_res
    if (!sctx->cs_clear_buffer) {
       sctx->cs_clear_buffer = si_create_dma_compute_shader(
-         &sctx->b, SI_COMPUTE_CLEAR_DW_PER_THREAD, shader_dst_stream_policy, false);
+         sctx, SI_COMPUTE_CLEAR_DW_PER_THREAD, shader_dst_stream_policy, false);
    }
 
    si_launch_grid_internal_ssbos(sctx, &info, sctx->cs_clear_buffer, flags, coher,


@@ -1650,7 +1650,7 @@ void *si_create_blit_cs(struct si_context *sctx, const union si_compute_blit_sha
 /* si_shaderlib_tgsi.c */
 void *si_get_blitter_vs(struct si_context *sctx, enum blitter_attrib_type type,
                         unsigned num_layers);
-void *si_create_dma_compute_shader(struct pipe_context *ctx, unsigned num_dwords_per_thread,
+void *si_create_dma_compute_shader(struct si_context *sctx, unsigned num_dwords_per_thread,
                                    bool dst_stream_cache_policy, bool is_copy);
 void *si_create_clear_buffer_rmw_cs(struct si_context *sctx);
 void *si_clear_render_target_shader(struct si_context *sctx, enum pipe_texture_target type);


@@ -645,3 +645,85 @@ void *si_clear_12bytes_buffer_shader(struct si_context *sctx)
    return create_shader_state(sctx, b.shader);
 }
 
+/* Create a compute shader implementing clear_buffer or copy_buffer. */
+void *si_create_dma_compute_shader(struct si_context *sctx, unsigned num_dwords_per_thread,
+                                   bool dst_stream_cache_policy, bool is_copy)
+{
+   assert(util_is_power_of_two_nonzero(num_dwords_per_thread));
+
+   const nir_shader_compiler_options *options =
+      sctx->b.screen->get_compiler_options(sctx->b.screen, PIPE_SHADER_IR_NIR,
+                                           PIPE_SHADER_COMPUTE);
+
+   unsigned store_qualifier = ACCESS_COHERENT | ACCESS_RESTRICT;
+   if (dst_stream_cache_policy)
+      store_qualifier |= ACCESS_NON_TEMPORAL;
+
+   /* Don't cache loads, because there is no reuse. */
+   unsigned load_qualifier = store_qualifier | ACCESS_NON_TEMPORAL;
+
+   nir_builder b =
+      nir_builder_init_simple_shader(MESA_SHADER_COMPUTE, options, "create_dma_compute");
+
+   unsigned default_wave_size = si_determine_wave_size(sctx->screen, NULL);
+   b.shader->info.workgroup_size[0] = default_wave_size;
+   b.shader->info.workgroup_size[1] = 1;
+   b.shader->info.workgroup_size[2] = 1;
+   b.shader->info.num_ssbos = 1;
+
+   unsigned num_mem_ops = MAX2(1, num_dwords_per_thread / 4);
+   unsigned *inst_dwords = alloca(num_mem_ops * sizeof(unsigned));
+
+   for (unsigned i = 0; i < num_mem_ops; i++) {
+      if (i * 4 < num_dwords_per_thread)
+         inst_dwords[i] = MIN2(4, num_dwords_per_thread - i * 4);
+   }
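+   /* inst_dwords now holds the vector width of each memory op: since
+    * num_dwords_per_thread is a power of two, e.g. 16 dwords/thread yields
+    * {4, 4, 4, 4} and 2 dwords/thread yields a single {2}.
+    */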
+
+   /* If there are multiple stores,
+    * the first store writes into 0 * wavesize + tid,
+    * the 2nd store writes into 1 * wavesize + tid,
+    * the 3rd store writes into 2 * wavesize + tid, etc.
+    */
+   nir_def *store_address = get_global_ids(&b, 1);
+
+   /* Convert from a "store size unit" into bytes. */
+   store_address = nir_imul_imm(&b, store_address, 4 * inst_dwords[0]);
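+   /* E.g. with a wave size of 64 and 4-dword stores, thread tid of workgroup 0
+    * starts at byte tid * 16, and each further memory op below advances the
+    * address by 64 * 16 bytes.
+    */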
+
+   nir_def *load_address = store_address, *value, *values[num_mem_ops];
+   value = nir_undef(&b, 1, 32);
+
+   if (is_copy) {
+      b.shader->info.num_ssbos++;
+   } else {
+      b.shader->info.cs.user_data_components_amd = inst_dwords[0];
+      value = nir_trim_vector(&b, nir_load_user_data_amd(&b), inst_dwords[0]);
+   }
+
+   /* Distance between a load and a store for latency hiding. */
+   unsigned load_store_distance = is_copy ? 8 : 0;
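+   /* The loop below is software-pipelined: iteration i issues load i (while
+    * i < num_mem_ops) and store i - 8 (once i >= 8), so the copy path can have
+    * up to 8 loads in flight before the first value is consumed by a store.
+    * Clears use a distance of 0 and store immediately.
+    */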
+
+   for (unsigned i = 0; i < num_mem_ops + load_store_distance; i++) {
+      int d = i - load_store_distance;
+
+      if (is_copy && i < num_mem_ops) {
+         if (i) {
+            load_address = nir_iadd(&b, load_address,
+                                    nir_imm_int(&b, 4 * inst_dwords[i] * default_wave_size));
+         }
+         values[i] = nir_load_ssbo(&b, 4, 32, nir_imm_int(&b, 1), load_address,
+                                   .access = load_qualifier);
+      }
+
+      if (d >= 0) {
+         if (d) {
+            store_address = nir_iadd(&b, store_address,
+                                     nir_imm_int(&b, 4 * inst_dwords[d] * default_wave_size));
+         }
+         nir_store_ssbo(&b, is_copy ? values[d] : value, nir_imm_int(&b, 0), store_address,
+                        .access = store_qualifier);
+      }
+   }
+
+   return create_shader_state(sctx, b.shader);
+}
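
For reference, create_shader_state() (defined earlier in si_shaderlib_nir.c and already used by si_clear_12bytes_buffer_shader above) is what turns the finished nir_builder shader into a Gallium compute state object. A minimal sketch of that pattern, using standard Gallium entry points rather than quoting the helper verbatim:

   /* Sketch only: how a freshly built NIR shader becomes compute state.
    * Names follow Gallium; the exact helper body may differ. */
   static void *create_shader_state_sketch(struct si_context *sctx, nir_shader *nir)
   {
      /* Let the screen run its driver-specific NIR finalization/lowering. */
      sctx->b.screen->finalize_nir(sctx->b.screen, nir);

      struct pipe_compute_state state = {0};
      state.ir_type = PIPE_SHADER_IR_NIR; /* NIR is consumed directly; no TGSI tokens */
      state.prog = nir;                   /* ownership passes to the driver */
      return sctx->b.create_compute_state(&sctx->b, &state);
   }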


@@ -67,112 +67,6 @@ void *si_get_blitter_vs(struct si_context *sctx, enum blitter_attrib_type type,
    return *vs;
 }
 
-/* Create a compute shader implementing clear_buffer or copy_buffer. */
-void *si_create_dma_compute_shader(struct pipe_context *ctx, unsigned num_dwords_per_thread,
-                                   bool dst_stream_cache_policy, bool is_copy)
-{
-   struct si_screen *sscreen = (struct si_screen *)ctx->screen;
-
-   assert(util_is_power_of_two_nonzero(num_dwords_per_thread));
-
-   unsigned store_qualifier = TGSI_MEMORY_COHERENT | TGSI_MEMORY_RESTRICT;
-   if (dst_stream_cache_policy)
-      store_qualifier |= TGSI_MEMORY_STREAM_CACHE_POLICY;
-
-   /* Don't cache loads, because there is no reuse. */
-   unsigned load_qualifier = store_qualifier | TGSI_MEMORY_STREAM_CACHE_POLICY;
-
-   unsigned num_mem_ops = MAX2(1, num_dwords_per_thread / 4);
-   unsigned *inst_dwords = alloca(num_mem_ops * sizeof(unsigned));
-
-   for (unsigned i = 0; i < num_mem_ops; i++) {
-      if (i * 4 < num_dwords_per_thread)
-         inst_dwords[i] = MIN2(4, num_dwords_per_thread - i * 4);
-   }
-
-   struct ureg_program *ureg = ureg_create(PIPE_SHADER_COMPUTE);
-   if (!ureg)
-      return NULL;
-
-   unsigned default_wave_size = si_determine_wave_size(sscreen, NULL);
-   ureg_property(ureg, TGSI_PROPERTY_CS_FIXED_BLOCK_WIDTH, default_wave_size);
-   ureg_property(ureg, TGSI_PROPERTY_CS_FIXED_BLOCK_HEIGHT, 1);
-   ureg_property(ureg, TGSI_PROPERTY_CS_FIXED_BLOCK_DEPTH, 1);
-
-   struct ureg_src value;
-   if (!is_copy) {
-      ureg_property(ureg, TGSI_PROPERTY_CS_USER_DATA_COMPONENTS_AMD, inst_dwords[0]);
-      value = ureg_DECL_system_value(ureg, TGSI_SEMANTIC_CS_USER_DATA_AMD, 0);
-   }
-
-   struct ureg_src tid = ureg_DECL_system_value(ureg, TGSI_SEMANTIC_THREAD_ID, 0);
-   struct ureg_src blk = ureg_DECL_system_value(ureg, TGSI_SEMANTIC_BLOCK_ID, 0);
-   struct ureg_dst store_addr = ureg_writemask(ureg_DECL_temporary(ureg), TGSI_WRITEMASK_X);
-   struct ureg_dst load_addr = ureg_writemask(ureg_DECL_temporary(ureg), TGSI_WRITEMASK_X);
-   struct ureg_dst dstbuf = ureg_dst(ureg_DECL_buffer(ureg, 0, false));
-   struct ureg_src srcbuf;
-   struct ureg_src *values = NULL;
-
-   if (is_copy) {
-      srcbuf = ureg_DECL_buffer(ureg, 1, false);
-      values = malloc(num_mem_ops * sizeof(struct ureg_src));
-   }
-
-   /* If there are multiple stores, the first store writes into 0*wavesize+tid,
-    * the 2nd store writes into 1*wavesize+tid, the 3rd store writes into 2*wavesize+tid, etc.
-    */
-   ureg_UMAD(ureg, store_addr, blk, ureg_imm1u(ureg, default_wave_size * num_mem_ops),
-             tid);
-   /* Convert from a "store size unit" into bytes. */
-   ureg_UMUL(ureg, store_addr, ureg_src(store_addr), ureg_imm1u(ureg, 4 * inst_dwords[0]));
-   ureg_MOV(ureg, load_addr, ureg_src(store_addr));
-
-   /* Distance between a load and a store for latency hiding. */
-   unsigned load_store_distance = is_copy ? 8 : 0;
-
-   for (unsigned i = 0; i < num_mem_ops + load_store_distance; i++) {
-      int d = i - load_store_distance;
-
-      if (is_copy && i < num_mem_ops) {
-         if (i) {
-            ureg_UADD(ureg, load_addr, ureg_src(load_addr),
-                      ureg_imm1u(ureg, 4 * inst_dwords[i] * default_wave_size));
-         }
-
-         values[i] = ureg_src(ureg_DECL_temporary(ureg));
-         struct ureg_dst dst =
-            ureg_writemask(ureg_dst(values[i]), u_bit_consecutive(0, inst_dwords[i]));
-         struct ureg_src srcs[] = {srcbuf, ureg_src(load_addr)};
-         ureg_memory_insn(ureg, TGSI_OPCODE_LOAD, &dst, 1, srcs, 2, load_qualifier,
-                          TGSI_TEXTURE_BUFFER, 0);
-      }
-
-      if (d >= 0) {
-         if (d) {
-            ureg_UADD(ureg, store_addr, ureg_src(store_addr),
-                      ureg_imm1u(ureg, 4 * inst_dwords[d] * default_wave_size));
-         }
-
-         struct ureg_dst dst = ureg_writemask(dstbuf, u_bit_consecutive(0, inst_dwords[d]));
-         struct ureg_src srcs[] = {ureg_src(store_addr), is_copy ? values[d] : value};
-         ureg_memory_insn(ureg, TGSI_OPCODE_STORE, &dst, 1, srcs, 2, store_qualifier,
-                          TGSI_TEXTURE_BUFFER, 0);
-      }
-   }
-
-   ureg_END(ureg);
-
-   struct pipe_compute_state state = {};
-   state.ir_type = PIPE_SHADER_IR_TGSI;
-   state.prog = ureg_get_tokens(ureg, NULL);
-
-   void *cs = ctx->create_compute_state(ctx, &state);
-   ureg_destroy(ureg);
-   ureg_free_tokens(state.prog);
-   free(values);
-   return cs;
-}
-
 /* Create the compute shader that is used to collect the results.
  *
  * One compute grid with a single thread is launched for every query result

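The deleted TGSI version mirrors the new NIR code one-to-one: TGSI_MEMORY_COHERENT/TGSI_MEMORY_RESTRICT become ACCESS_COHERENT/ACCESS_RESTRICT, TGSI_MEMORY_STREAM_CACHE_POLICY becomes ACCESS_NON_TEMPORAL, the TGSI_PROPERTY_CS_FIXED_BLOCK_* properties become b.shader->info.workgroup_size[], TGSI_SEMANTIC_CS_USER_DATA_AMD becomes nir_load_user_data_amd(), the UMAD of block and thread IDs becomes get_global_ids(), and the buffer LOAD/STORE memory instructions become nir_load_ssbo()/nir_store_ssbo(). The whole ureg token lifecycle (ureg_create, ureg_get_tokens, create_compute_state with PIPE_SHADER_IR_TGSI, ureg_destroy, ureg_free_tokens) collapses into a single create_shader_state(sctx, b.shader) call on the NIR side.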

@@ -119,7 +119,7 @@ void si_test_dma_perf(struct si_screen *sscreen)
          void *compute_shader = NULL;
          if (test_cs) {
-            compute_shader = si_create_dma_compute_shader(ctx, cs_dwords_per_thread,
+            compute_shader = si_create_dma_compute_shader(sctx, cs_dwords_per_thread,
                                                           cache_policy == L2_STREAM, is_copy);
          }