radeonsi: "create_dma_compute" shader in nir
Reviewed-by: Marek Olšák <marek.olsak@amd.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/25782>
commit 6584088cd5
parent d54c140eb6
5 changed files with 86 additions and 110 deletions
@@ -395,7 +395,7 @@ static void si_compute_do_clear_or_copy(struct si_context *sctx, struct pipe_res
 
    if (!sctx->cs_copy_buffer) {
       sctx->cs_copy_buffer = si_create_dma_compute_shader(
-         &sctx->b, SI_COMPUTE_COPY_DW_PER_THREAD, shader_dst_stream_policy, true);
+         sctx, SI_COMPUTE_COPY_DW_PER_THREAD, shader_dst_stream_policy, true);
    }
 
    si_launch_grid_internal_ssbos(sctx, &info, sctx->cs_copy_buffer, flags, coher,
@@ -409,7 +409,7 @@ static void si_compute_do_clear_or_copy(struct si_context *sctx, struct pipe_res
 
    if (!sctx->cs_clear_buffer) {
       sctx->cs_clear_buffer = si_create_dma_compute_shader(
-         &sctx->b, SI_COMPUTE_CLEAR_DW_PER_THREAD, shader_dst_stream_policy, false);
+         sctx, SI_COMPUTE_CLEAR_DW_PER_THREAD, shader_dst_stream_policy, false);
    }
 
    si_launch_grid_internal_ssbos(sctx, &info, sctx->cs_clear_buffer, flags, coher,
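
Both call sites compile the shader lazily on first use and cache it on si_context; every thread of the cached shader then processes SI_COMPUTE_*_DW_PER_THREAD dwords, so one single-wave workgroup covers wave_size * dwords_per_thread * 4 bytes. A minimal C sketch of the grid sizing this implies (the helper name and parameters are illustrative, not part of this commit):

#include <stdio.h>

/* Hypothetical helper, not from the commit: how many single-wave
 * workgroups a clear/copy of `size_bytes` needs when every thread
 * handles `dw_per_thread` dwords (4 bytes each). */
static unsigned dma_num_workgroups(unsigned size_bytes, unsigned wave_size,
                                   unsigned dw_per_thread)
{
   unsigned bytes_per_group = wave_size * dw_per_thread * 4;
   /* Round up so a partial trailing group is still launched. */
   return (size_bytes + bytes_per_group - 1) / bytes_per_group;
}

int main(void)
{
   /* e.g. a 1 MiB clear with wave64 and 4 dwords per thread */
   printf("%u workgroups\n", dma_num_workgroups(1024 * 1024, 64, 4));
   return 0;
}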

@@ -1650,7 +1650,7 @@ void *si_create_blit_cs(struct si_context *sctx, const union si_compute_blit_sha
 /* si_shaderlib_tgsi.c */
 void *si_get_blitter_vs(struct si_context *sctx, enum blitter_attrib_type type,
                         unsigned num_layers);
-void *si_create_dma_compute_shader(struct pipe_context *ctx, unsigned num_dwords_per_thread,
+void *si_create_dma_compute_shader(struct si_context *sctx, unsigned num_dwords_per_thread,
                                    bool dst_stream_cache_policy, bool is_copy);
 void *si_create_clear_buffer_rmw_cs(struct si_context *sctx);
 void *si_clear_render_target_shader(struct si_context *sctx, enum pipe_texture_target type);

@@ -645,3 +645,85 @@ void *si_clear_12bytes_buffer_shader(struct si_context *sctx)
    return create_shader_state(sctx, b.shader);
 }
+
+/* Create a compute shader implementing clear_buffer or copy_buffer. */
+void *si_create_dma_compute_shader(struct si_context *sctx, unsigned num_dwords_per_thread,
+                                   bool dst_stream_cache_policy, bool is_copy)
+{
+   assert(util_is_power_of_two_nonzero(num_dwords_per_thread));
+
+   const nir_shader_compiler_options *options =
+      sctx->b.screen->get_compiler_options(sctx->b.screen, PIPE_SHADER_IR_NIR, PIPE_SHADER_COMPUTE);
+
+   unsigned store_qualifier = ACCESS_COHERENT | ACCESS_RESTRICT;
+   if (dst_stream_cache_policy)
+      store_qualifier |= ACCESS_NON_TEMPORAL;
+
+   /* Don't cache loads, because there is no reuse. */
+   unsigned load_qualifier = store_qualifier | ACCESS_NON_TEMPORAL;
+
+   nir_builder b =
+      nir_builder_init_simple_shader(MESA_SHADER_COMPUTE, options, "create_dma_compute");
+
+   unsigned default_wave_size = si_determine_wave_size(sctx->screen, NULL);
+
+   b.shader->info.workgroup_size[0] = default_wave_size;
+   b.shader->info.workgroup_size[1] = 1;
+   b.shader->info.workgroup_size[2] = 1;
+   b.shader->info.num_ssbos = 1;
+
+   unsigned num_mem_ops = MAX2(1, num_dwords_per_thread / 4);
+   unsigned *inst_dwords = alloca(num_mem_ops * sizeof(unsigned));
+
+   for (unsigned i = 0; i < num_mem_ops; i++) {
+      if (i * 4 < num_dwords_per_thread)
+         inst_dwords[i] = MIN2(4, num_dwords_per_thread - i * 4);
+   }
+
+   /* If there are multiple stores,
+    * the first store writes into 0 * wavesize + tid,
+    * the 2nd store writes into 1 * wavesize + tid,
+    * the 3rd store writes into 2 * wavesize + tid, etc.
+    */
+   nir_def *store_address = get_global_ids(&b, 1);
+
+   /* Convert from a "store size unit" into bytes. */
+   store_address = nir_imul_imm(&b, store_address, 4 * inst_dwords[0]);
+
+   nir_def *load_address = store_address, *value, *values[num_mem_ops];
+   value = nir_undef(&b, 1, 32);
+
+   if (is_copy) {
+      b.shader->info.num_ssbos++;
+   } else {
+      b.shader->info.cs.user_data_components_amd = inst_dwords[0];
+      value = nir_trim_vector(&b, nir_load_user_data_amd(&b), inst_dwords[0]);
+   }
+
+   /* Distance between a load and a store for latency hiding. */
+   unsigned load_store_distance = is_copy ? 8 : 0;
+
+   for (unsigned i = 0; i < num_mem_ops + load_store_distance; i++) {
+      int d = i - load_store_distance;
+
+      if (is_copy && i < num_mem_ops) {
+         if (i) {
+            load_address = nir_iadd(&b, load_address,
+                                    nir_imm_int(&b, 4 * inst_dwords[i] * default_wave_size));
+         }
+         values[i] = nir_load_ssbo(&b, 4, 32, nir_imm_int(&b, 1), load_address,
+                                   .access = load_qualifier);
+      }
+
+      if (d >= 0) {
+         if (d) {
+            store_address = nir_iadd(&b, store_address,
+                                     nir_imm_int(&b, 4 * inst_dwords[d] * default_wave_size));
+         }
+         nir_store_ssbo(&b, is_copy ? values[d] : value, nir_imm_int(&b, 0), store_address,
+                        .access = store_qualifier);
+      }
+   }
+
+   return create_shader_state(sctx, b.shader);
+}
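
The NIR variant keeps the TGSI algorithm: num_dwords_per_thread is split into memory ops of at most 4 dwords, consecutive ops of one thread are a full wave apart in memory, and for copies the loads are issued load_store_distance = 8 loop iterations before the matching stores, so several loads are in flight before the first store consumes one. A host-side C sketch of that address arithmetic and schedule, with illustrative names and example parameters (a simulation, not shader code):

#include <stdio.h>

#define MAX2(a, b) ((a) > (b) ? (a) : (b))
#define MIN2(a, b) ((a) < (b) ? (a) : (b))

/* Simulate one thread of the DMA compute shader: the per-op dword counts,
 * the byte addresses, and the software-pipelined load/store order.
 * `global_id` stands in for get_global_ids(&b, 1). */
static void simulate_thread(unsigned num_dwords_per_thread, unsigned wave_size,
                            unsigned global_id, int is_copy)
{
   unsigned num_mem_ops = MAX2(1, num_dwords_per_thread / 4);
   unsigned inst_dwords[8];
   for (unsigned i = 0; i < num_mem_ops; i++)
      inst_dwords[i] = MIN2(4, num_dwords_per_thread - i * 4);

   /* Base address in bytes: global id scaled by one "store size unit". */
   unsigned load_addr = global_id * 4 * inst_dwords[0];
   unsigned store_addr = load_addr;

   /* Loads run this many iterations ahead of the stores. */
   unsigned load_store_distance = is_copy ? 8 : 0;

   for (unsigned i = 0; i < num_mem_ops + load_store_distance; i++) {
      int d = (int)i - (int)load_store_distance;

      if (is_copy && i < num_mem_ops) {
         if (i)
            load_addr += 4 * inst_dwords[i] * wave_size; /* one wave apart */
         printf("iter %2u: load  op %u (%u dwords) @ byte %u\n",
                i, i, inst_dwords[i], load_addr);
      }
      if (d >= 0) {
         if (d)
            store_addr += 4 * inst_dwords[d] * wave_size;
         printf("iter %2u: store op %d (%u dwords) @ byte %u\n",
                i, d, inst_dwords[d], store_addr);
      }
   }
}

int main(void)
{
   /* e.g. 16 dwords per thread in a wave64 copy: 4 loads issue first,
    * the 4 stores start 8 iterations later. */
   simulate_thread(16, 64, 0, 1);
   return 0;
}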

@@ -67,112 +67,6 @@ void *si_get_blitter_vs(struct si_context *sctx, enum blitter_attrib_type type,
    return *vs;
 }
 
-/* Create a compute shader implementing clear_buffer or copy_buffer. */
-void *si_create_dma_compute_shader(struct pipe_context *ctx, unsigned num_dwords_per_thread,
-                                   bool dst_stream_cache_policy, bool is_copy)
-{
-   struct si_screen *sscreen = (struct si_screen *)ctx->screen;
-   assert(util_is_power_of_two_nonzero(num_dwords_per_thread));
-
-   unsigned store_qualifier = TGSI_MEMORY_COHERENT | TGSI_MEMORY_RESTRICT;
-   if (dst_stream_cache_policy)
-      store_qualifier |= TGSI_MEMORY_STREAM_CACHE_POLICY;
-
-   /* Don't cache loads, because there is no reuse. */
-   unsigned load_qualifier = store_qualifier | TGSI_MEMORY_STREAM_CACHE_POLICY;
-
-   unsigned num_mem_ops = MAX2(1, num_dwords_per_thread / 4);
-   unsigned *inst_dwords = alloca(num_mem_ops * sizeof(unsigned));
-
-   for (unsigned i = 0; i < num_mem_ops; i++) {
-      if (i * 4 < num_dwords_per_thread)
-         inst_dwords[i] = MIN2(4, num_dwords_per_thread - i * 4);
-   }
-
-   struct ureg_program *ureg = ureg_create(PIPE_SHADER_COMPUTE);
-   if (!ureg)
-      return NULL;
-
-   unsigned default_wave_size = si_determine_wave_size(sscreen, NULL);
-
-   ureg_property(ureg, TGSI_PROPERTY_CS_FIXED_BLOCK_WIDTH, default_wave_size);
-   ureg_property(ureg, TGSI_PROPERTY_CS_FIXED_BLOCK_HEIGHT, 1);
-   ureg_property(ureg, TGSI_PROPERTY_CS_FIXED_BLOCK_DEPTH, 1);
-
-   struct ureg_src value;
-   if (!is_copy) {
-      ureg_property(ureg, TGSI_PROPERTY_CS_USER_DATA_COMPONENTS_AMD, inst_dwords[0]);
-      value = ureg_DECL_system_value(ureg, TGSI_SEMANTIC_CS_USER_DATA_AMD, 0);
-   }
-
-   struct ureg_src tid = ureg_DECL_system_value(ureg, TGSI_SEMANTIC_THREAD_ID, 0);
-   struct ureg_src blk = ureg_DECL_system_value(ureg, TGSI_SEMANTIC_BLOCK_ID, 0);
-   struct ureg_dst store_addr = ureg_writemask(ureg_DECL_temporary(ureg), TGSI_WRITEMASK_X);
-   struct ureg_dst load_addr = ureg_writemask(ureg_DECL_temporary(ureg), TGSI_WRITEMASK_X);
-   struct ureg_dst dstbuf = ureg_dst(ureg_DECL_buffer(ureg, 0, false));
-   struct ureg_src srcbuf;
-   struct ureg_src *values = NULL;
-
-   if (is_copy) {
-      srcbuf = ureg_DECL_buffer(ureg, 1, false);
-      values = malloc(num_mem_ops * sizeof(struct ureg_src));
-   }
-
-   /* If there are multiple stores, the first store writes into 0*wavesize+tid,
-    * the 2nd store writes into 1*wavesize+tid, the 3rd store writes into 2*wavesize+tid, etc.
-    */
-   ureg_UMAD(ureg, store_addr, blk, ureg_imm1u(ureg, default_wave_size * num_mem_ops),
-             tid);
-   /* Convert from a "store size unit" into bytes. */
-   ureg_UMUL(ureg, store_addr, ureg_src(store_addr), ureg_imm1u(ureg, 4 * inst_dwords[0]));
-   ureg_MOV(ureg, load_addr, ureg_src(store_addr));
-
-   /* Distance between a load and a store for latency hiding. */
-   unsigned load_store_distance = is_copy ? 8 : 0;
-
-   for (unsigned i = 0; i < num_mem_ops + load_store_distance; i++) {
-      int d = i - load_store_distance;
-
-      if (is_copy && i < num_mem_ops) {
-         if (i) {
-            ureg_UADD(ureg, load_addr, ureg_src(load_addr),
-                      ureg_imm1u(ureg, 4 * inst_dwords[i] * default_wave_size));
-         }
-
-         values[i] = ureg_src(ureg_DECL_temporary(ureg));
-         struct ureg_dst dst =
-            ureg_writemask(ureg_dst(values[i]), u_bit_consecutive(0, inst_dwords[i]));
-         struct ureg_src srcs[] = {srcbuf, ureg_src(load_addr)};
-         ureg_memory_insn(ureg, TGSI_OPCODE_LOAD, &dst, 1, srcs, 2, load_qualifier,
-                          TGSI_TEXTURE_BUFFER, 0);
-      }
-
-      if (d >= 0) {
-         if (d) {
-            ureg_UADD(ureg, store_addr, ureg_src(store_addr),
-                      ureg_imm1u(ureg, 4 * inst_dwords[d] * default_wave_size));
-         }
-
-         struct ureg_dst dst = ureg_writemask(dstbuf, u_bit_consecutive(0, inst_dwords[d]));
-         struct ureg_src srcs[] = {ureg_src(store_addr), is_copy ? values[d] : value};
-         ureg_memory_insn(ureg, TGSI_OPCODE_STORE, &dst, 1, srcs, 2, store_qualifier,
-                          TGSI_TEXTURE_BUFFER, 0);
-      }
-   }
-   ureg_END(ureg);
-
-   struct pipe_compute_state state = {};
-   state.ir_type = PIPE_SHADER_IR_TGSI;
-   state.prog = ureg_get_tokens(ureg, NULL);
-
-   void *cs = ctx->create_compute_state(ctx, &state);
-   ureg_destroy(ureg);
-   ureg_free_tokens(state.prog);
-
-   free(values);
-   return cs;
-}
-
 /* Create the compute shader that is used to collect the results.
  *
  * One compute grid with a single thread is launched for every query result
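
The removed TGSI epilogue wrapped the ureg token blob in a pipe_compute_state with PIPE_SHADER_IR_TGSI by hand; the NIR version instead delegates this to the pre-existing create_shader_state() helper in si_shaderlib_nir.c. A plausible sketch of what such a helper does under the usual gallium flow, assuming mesa's internal headers (the helper's real body is not part of this diff):

/* Sketch only: wrap a freshly built NIR shader in a gallium compute state.
 * This mirrors the deleted TGSI epilogue above, with PIPE_SHADER_IR_NIR
 * replacing the token blob; ownership of `nir` passes to the driver. */
static void *create_shader_state_sketch(struct si_context *sctx, nir_shader *nir)
{
   struct pipe_compute_state state = {0};
   state.ir_type = PIPE_SHADER_IR_NIR;
   state.prog = nir;
   return sctx->b.create_compute_state(&sctx->b, &state);
}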

@@ -119,7 +119,7 @@ void si_test_dma_perf(struct si_screen *sscreen)
 
    void *compute_shader = NULL;
    if (test_cs) {
-      compute_shader = si_create_dma_compute_shader(ctx, cs_dwords_per_thread,
+      compute_shader = si_create_dma_compute_shader(sctx, cs_dwords_per_thread,
                                                     cache_policy == L2_STREAM, is_copy);
    }
 