mirror of
https://gitlab.freedesktop.org/mesa/mesa.git
synced 2026-06-17 19:38:21 +02:00
radeonsi: "create_dma_compute" shader in nir
Reviewed-by: Marek Olšák <marek.olsak@amd.com> Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/25782>
This commit is contained in:
parent
d54c140eb6
commit
6584088cd5
5 changed files with 86 additions and 110 deletions
|
|
@ -395,7 +395,7 @@ static void si_compute_do_clear_or_copy(struct si_context *sctx, struct pipe_res
|
||||||
|
|
||||||
if (!sctx->cs_copy_buffer) {
|
if (!sctx->cs_copy_buffer) {
|
||||||
sctx->cs_copy_buffer = si_create_dma_compute_shader(
|
sctx->cs_copy_buffer = si_create_dma_compute_shader(
|
||||||
&sctx->b, SI_COMPUTE_COPY_DW_PER_THREAD, shader_dst_stream_policy, true);
|
sctx, SI_COMPUTE_COPY_DW_PER_THREAD, shader_dst_stream_policy, true);
|
||||||
}
|
}
|
||||||
|
|
||||||
si_launch_grid_internal_ssbos(sctx, &info, sctx->cs_copy_buffer, flags, coher,
|
si_launch_grid_internal_ssbos(sctx, &info, sctx->cs_copy_buffer, flags, coher,
|
||||||
|
|
@ -409,7 +409,7 @@ static void si_compute_do_clear_or_copy(struct si_context *sctx, struct pipe_res
|
||||||
|
|
||||||
if (!sctx->cs_clear_buffer) {
|
if (!sctx->cs_clear_buffer) {
|
||||||
sctx->cs_clear_buffer = si_create_dma_compute_shader(
|
sctx->cs_clear_buffer = si_create_dma_compute_shader(
|
||||||
&sctx->b, SI_COMPUTE_CLEAR_DW_PER_THREAD, shader_dst_stream_policy, false);
|
sctx, SI_COMPUTE_CLEAR_DW_PER_THREAD, shader_dst_stream_policy, false);
|
||||||
}
|
}
|
||||||
|
|
||||||
si_launch_grid_internal_ssbos(sctx, &info, sctx->cs_clear_buffer, flags, coher,
|
si_launch_grid_internal_ssbos(sctx, &info, sctx->cs_clear_buffer, flags, coher,
|
||||||
|
|
|
||||||
|
|
@ -1650,7 +1650,7 @@ void *si_create_blit_cs(struct si_context *sctx, const union si_compute_blit_sha
|
||||||
/* si_shaderlib_tgsi.c */
|
/* si_shaderlib_tgsi.c */
|
||||||
void *si_get_blitter_vs(struct si_context *sctx, enum blitter_attrib_type type,
|
void *si_get_blitter_vs(struct si_context *sctx, enum blitter_attrib_type type,
|
||||||
unsigned num_layers);
|
unsigned num_layers);
|
||||||
void *si_create_dma_compute_shader(struct pipe_context *ctx, unsigned num_dwords_per_thread,
|
void *si_create_dma_compute_shader(struct si_context *sctx, unsigned num_dwords_per_thread,
|
||||||
bool dst_stream_cache_policy, bool is_copy);
|
bool dst_stream_cache_policy, bool is_copy);
|
||||||
void *si_create_clear_buffer_rmw_cs(struct si_context *sctx);
|
void *si_create_clear_buffer_rmw_cs(struct si_context *sctx);
|
||||||
void *si_clear_render_target_shader(struct si_context *sctx, enum pipe_texture_target type);
|
void *si_clear_render_target_shader(struct si_context *sctx, enum pipe_texture_target type);
|
||||||
|
|
|
||||||
|
|
@ -645,3 +645,85 @@ void *si_clear_12bytes_buffer_shader(struct si_context *sctx)
|
||||||
|
|
||||||
return create_shader_state(sctx, b.shader);
|
return create_shader_state(sctx, b.shader);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/* Create a compute shader implementing clear_buffer or copy_buffer. */
|
||||||
|
void *si_create_dma_compute_shader(struct si_context *sctx, unsigned num_dwords_per_thread,
|
||||||
|
bool dst_stream_cache_policy, bool is_copy)
|
||||||
|
{
|
||||||
|
assert(util_is_power_of_two_nonzero(num_dwords_per_thread));
|
||||||
|
|
||||||
|
const nir_shader_compiler_options *options =
|
||||||
|
sctx->b.screen->get_compiler_options(sctx->b.screen, PIPE_SHADER_IR_NIR, PIPE_SHADER_COMPUTE);
|
||||||
|
|
||||||
|
unsigned store_qualifier = ACCESS_COHERENT | ACCESS_RESTRICT;
|
||||||
|
if (dst_stream_cache_policy)
|
||||||
|
store_qualifier |= ACCESS_NON_TEMPORAL;
|
||||||
|
|
||||||
|
/* Don't cache loads, because there is no reuse. */
|
||||||
|
unsigned load_qualifier = store_qualifier | ACCESS_NON_TEMPORAL;
|
||||||
|
|
||||||
|
nir_builder b =
|
||||||
|
nir_builder_init_simple_shader(MESA_SHADER_COMPUTE, options, "create_dma_compute");
|
||||||
|
|
||||||
|
unsigned default_wave_size = si_determine_wave_size(sctx->screen, NULL);
|
||||||
|
|
||||||
|
b.shader->info.workgroup_size[0] = default_wave_size;
|
||||||
|
b.shader->info.workgroup_size[1] = 1;
|
||||||
|
b.shader->info.workgroup_size[2] = 1;
|
||||||
|
b.shader->info.num_ssbos = 1;
|
||||||
|
|
||||||
|
unsigned num_mem_ops = MAX2(1, num_dwords_per_thread / 4);
|
||||||
|
unsigned *inst_dwords = alloca(num_mem_ops * sizeof(unsigned));
|
||||||
|
|
||||||
|
for (unsigned i = 0; i < num_mem_ops; i++) {
|
||||||
|
if (i * 4 < num_dwords_per_thread)
|
||||||
|
inst_dwords[i] = MIN2(4, num_dwords_per_thread - i * 4);
|
||||||
|
}
|
||||||
|
|
||||||
|
/* If there are multiple stores,
|
||||||
|
* the first store writes into 0 * wavesize + tid,
|
||||||
|
* the 2nd store writes into 1 * wavesize + tid,
|
||||||
|
* the 3rd store writes into 2 * wavesize + tid, etc.
|
||||||
|
*/
|
||||||
|
nir_def *store_address = get_global_ids(&b, 1);
|
||||||
|
|
||||||
|
/* Convert from a "store size unit" into bytes. */
|
||||||
|
store_address = nir_imul_imm(&b, store_address, 4 * inst_dwords[0]);
|
||||||
|
|
||||||
|
nir_def *load_address = store_address, *value, *values[num_mem_ops];
|
||||||
|
value = nir_undef(&b, 1, 32);
|
||||||
|
|
||||||
|
if (is_copy) {
|
||||||
|
b.shader->info.num_ssbos++;
|
||||||
|
} else {
|
||||||
|
b.shader->info.cs.user_data_components_amd = inst_dwords[0];
|
||||||
|
value = nir_trim_vector(&b, nir_load_user_data_amd(&b), inst_dwords[0]);
|
||||||
|
}
|
||||||
|
|
||||||
|
/* Distance between a load and a store for latency hiding. */
|
||||||
|
unsigned load_store_distance = is_copy ? 8 : 0;
|
||||||
|
|
||||||
|
for (unsigned i = 0; i < num_mem_ops + load_store_distance; i++) {
|
||||||
|
int d = i - load_store_distance;
|
||||||
|
|
||||||
|
if (is_copy && i < num_mem_ops) {
|
||||||
|
if (i) {
|
||||||
|
load_address = nir_iadd(&b, load_address,
|
||||||
|
nir_imm_int(&b, 4 * inst_dwords[i] * default_wave_size));
|
||||||
|
}
|
||||||
|
values[i] = nir_load_ssbo(&b, 4, 32, nir_imm_int(&b, 1),load_address,
|
||||||
|
.access = load_qualifier);
|
||||||
|
}
|
||||||
|
|
||||||
|
if (d >= 0) {
|
||||||
|
if (d) {
|
||||||
|
store_address = nir_iadd(&b, store_address,
|
||||||
|
nir_imm_int(&b, 4 * inst_dwords[d] * default_wave_size));
|
||||||
|
}
|
||||||
|
nir_store_ssbo(&b, is_copy ? values[d] : value, nir_imm_int(&b, 0), store_address,
|
||||||
|
.access = store_qualifier);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
return create_shader_state(sctx, b.shader);
|
||||||
|
}
|
||||||
|
|
|
||||||
|
|
@ -67,112 +67,6 @@ void *si_get_blitter_vs(struct si_context *sctx, enum blitter_attrib_type type,
|
||||||
return *vs;
|
return *vs;
|
||||||
}
|
}
|
||||||
|
|
||||||
/* Create a compute shader implementing clear_buffer or copy_buffer. */
|
|
||||||
void *si_create_dma_compute_shader(struct pipe_context *ctx, unsigned num_dwords_per_thread,
|
|
||||||
bool dst_stream_cache_policy, bool is_copy)
|
|
||||||
{
|
|
||||||
struct si_screen *sscreen = (struct si_screen *)ctx->screen;
|
|
||||||
assert(util_is_power_of_two_nonzero(num_dwords_per_thread));
|
|
||||||
|
|
||||||
unsigned store_qualifier = TGSI_MEMORY_COHERENT | TGSI_MEMORY_RESTRICT;
|
|
||||||
if (dst_stream_cache_policy)
|
|
||||||
store_qualifier |= TGSI_MEMORY_STREAM_CACHE_POLICY;
|
|
||||||
|
|
||||||
/* Don't cache loads, because there is no reuse. */
|
|
||||||
unsigned load_qualifier = store_qualifier | TGSI_MEMORY_STREAM_CACHE_POLICY;
|
|
||||||
|
|
||||||
unsigned num_mem_ops = MAX2(1, num_dwords_per_thread / 4);
|
|
||||||
unsigned *inst_dwords = alloca(num_mem_ops * sizeof(unsigned));
|
|
||||||
|
|
||||||
for (unsigned i = 0; i < num_mem_ops; i++) {
|
|
||||||
if (i * 4 < num_dwords_per_thread)
|
|
||||||
inst_dwords[i] = MIN2(4, num_dwords_per_thread - i * 4);
|
|
||||||
}
|
|
||||||
|
|
||||||
struct ureg_program *ureg = ureg_create(PIPE_SHADER_COMPUTE);
|
|
||||||
if (!ureg)
|
|
||||||
return NULL;
|
|
||||||
|
|
||||||
unsigned default_wave_size = si_determine_wave_size(sscreen, NULL);
|
|
||||||
|
|
||||||
ureg_property(ureg, TGSI_PROPERTY_CS_FIXED_BLOCK_WIDTH, default_wave_size);
|
|
||||||
ureg_property(ureg, TGSI_PROPERTY_CS_FIXED_BLOCK_HEIGHT, 1);
|
|
||||||
ureg_property(ureg, TGSI_PROPERTY_CS_FIXED_BLOCK_DEPTH, 1);
|
|
||||||
|
|
||||||
struct ureg_src value;
|
|
||||||
if (!is_copy) {
|
|
||||||
ureg_property(ureg, TGSI_PROPERTY_CS_USER_DATA_COMPONENTS_AMD, inst_dwords[0]);
|
|
||||||
value = ureg_DECL_system_value(ureg, TGSI_SEMANTIC_CS_USER_DATA_AMD, 0);
|
|
||||||
}
|
|
||||||
|
|
||||||
struct ureg_src tid = ureg_DECL_system_value(ureg, TGSI_SEMANTIC_THREAD_ID, 0);
|
|
||||||
struct ureg_src blk = ureg_DECL_system_value(ureg, TGSI_SEMANTIC_BLOCK_ID, 0);
|
|
||||||
struct ureg_dst store_addr = ureg_writemask(ureg_DECL_temporary(ureg), TGSI_WRITEMASK_X);
|
|
||||||
struct ureg_dst load_addr = ureg_writemask(ureg_DECL_temporary(ureg), TGSI_WRITEMASK_X);
|
|
||||||
struct ureg_dst dstbuf = ureg_dst(ureg_DECL_buffer(ureg, 0, false));
|
|
||||||
struct ureg_src srcbuf;
|
|
||||||
struct ureg_src *values = NULL;
|
|
||||||
|
|
||||||
if (is_copy) {
|
|
||||||
srcbuf = ureg_DECL_buffer(ureg, 1, false);
|
|
||||||
values = malloc(num_mem_ops * sizeof(struct ureg_src));
|
|
||||||
}
|
|
||||||
|
|
||||||
/* If there are multiple stores, the first store writes into 0*wavesize+tid,
|
|
||||||
* the 2nd store writes into 1*wavesize+tid, the 3rd store writes into 2*wavesize+tid, etc.
|
|
||||||
*/
|
|
||||||
ureg_UMAD(ureg, store_addr, blk, ureg_imm1u(ureg, default_wave_size * num_mem_ops),
|
|
||||||
tid);
|
|
||||||
/* Convert from a "store size unit" into bytes. */
|
|
||||||
ureg_UMUL(ureg, store_addr, ureg_src(store_addr), ureg_imm1u(ureg, 4 * inst_dwords[0]));
|
|
||||||
ureg_MOV(ureg, load_addr, ureg_src(store_addr));
|
|
||||||
|
|
||||||
/* Distance between a load and a store for latency hiding. */
|
|
||||||
unsigned load_store_distance = is_copy ? 8 : 0;
|
|
||||||
|
|
||||||
for (unsigned i = 0; i < num_mem_ops + load_store_distance; i++) {
|
|
||||||
int d = i - load_store_distance;
|
|
||||||
|
|
||||||
if (is_copy && i < num_mem_ops) {
|
|
||||||
if (i) {
|
|
||||||
ureg_UADD(ureg, load_addr, ureg_src(load_addr),
|
|
||||||
ureg_imm1u(ureg, 4 * inst_dwords[i] * default_wave_size));
|
|
||||||
}
|
|
||||||
|
|
||||||
values[i] = ureg_src(ureg_DECL_temporary(ureg));
|
|
||||||
struct ureg_dst dst =
|
|
||||||
ureg_writemask(ureg_dst(values[i]), u_bit_consecutive(0, inst_dwords[i]));
|
|
||||||
struct ureg_src srcs[] = {srcbuf, ureg_src(load_addr)};
|
|
||||||
ureg_memory_insn(ureg, TGSI_OPCODE_LOAD, &dst, 1, srcs, 2, load_qualifier,
|
|
||||||
TGSI_TEXTURE_BUFFER, 0);
|
|
||||||
}
|
|
||||||
|
|
||||||
if (d >= 0) {
|
|
||||||
if (d) {
|
|
||||||
ureg_UADD(ureg, store_addr, ureg_src(store_addr),
|
|
||||||
ureg_imm1u(ureg, 4 * inst_dwords[d] * default_wave_size));
|
|
||||||
}
|
|
||||||
|
|
||||||
struct ureg_dst dst = ureg_writemask(dstbuf, u_bit_consecutive(0, inst_dwords[d]));
|
|
||||||
struct ureg_src srcs[] = {ureg_src(store_addr), is_copy ? values[d] : value};
|
|
||||||
ureg_memory_insn(ureg, TGSI_OPCODE_STORE, &dst, 1, srcs, 2, store_qualifier,
|
|
||||||
TGSI_TEXTURE_BUFFER, 0);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
ureg_END(ureg);
|
|
||||||
|
|
||||||
struct pipe_compute_state state = {};
|
|
||||||
state.ir_type = PIPE_SHADER_IR_TGSI;
|
|
||||||
state.prog = ureg_get_tokens(ureg, NULL);
|
|
||||||
|
|
||||||
void *cs = ctx->create_compute_state(ctx, &state);
|
|
||||||
ureg_destroy(ureg);
|
|
||||||
ureg_free_tokens(state.prog);
|
|
||||||
|
|
||||||
free(values);
|
|
||||||
return cs;
|
|
||||||
}
|
|
||||||
|
|
||||||
/* Create the compute shader that is used to collect the results.
|
/* Create the compute shader that is used to collect the results.
|
||||||
*
|
*
|
||||||
* One compute grid with a single thread is launched for every query result
|
* One compute grid with a single thread is launched for every query result
|
||||||
|
|
|
||||||
|
|
@ -119,7 +119,7 @@ void si_test_dma_perf(struct si_screen *sscreen)
|
||||||
|
|
||||||
void *compute_shader = NULL;
|
void *compute_shader = NULL;
|
||||||
if (test_cs) {
|
if (test_cs) {
|
||||||
compute_shader = si_create_dma_compute_shader(ctx, cs_dwords_per_thread,
|
compute_shader = si_create_dma_compute_shader(sctx, cs_dwords_per_thread,
|
||||||
cache_policy == L2_STREAM, is_copy);
|
cache_policy == L2_STREAM, is_copy);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
|
||||||
Loading…
Add table
Reference in a new issue