From 6584088cd5e6fe2538428b2ae13cbafc62604de2 Mon Sep 17 00:00:00 2001 From: Ganesh Belgur Ramachandra Date: Thu, 5 Oct 2023 06:49:58 -0500 Subject: [PATCH] radeonsi: "create_dma_compute" shader in nir MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Reviewed-by: Marek Olšák Part-of: --- .../drivers/radeonsi/si_compute_blit.c | 4 +- src/gallium/drivers/radeonsi/si_pipe.h | 2 +- .../drivers/radeonsi/si_shaderlib_nir.c | 82 ++++++++++++++ .../drivers/radeonsi/si_shaderlib_tgsi.c | 106 ------------------ .../drivers/radeonsi/si_test_dma_perf.c | 2 +- 5 files changed, 86 insertions(+), 110 deletions(-) diff --git a/src/gallium/drivers/radeonsi/si_compute_blit.c b/src/gallium/drivers/radeonsi/si_compute_blit.c index 12459bad043..cdb25ea53da 100644 --- a/src/gallium/drivers/radeonsi/si_compute_blit.c +++ b/src/gallium/drivers/radeonsi/si_compute_blit.c @@ -395,7 +395,7 @@ static void si_compute_do_clear_or_copy(struct si_context *sctx, struct pipe_res if (!sctx->cs_copy_buffer) { sctx->cs_copy_buffer = si_create_dma_compute_shader( - &sctx->b, SI_COMPUTE_COPY_DW_PER_THREAD, shader_dst_stream_policy, true); + sctx, SI_COMPUTE_COPY_DW_PER_THREAD, shader_dst_stream_policy, true); } si_launch_grid_internal_ssbos(sctx, &info, sctx->cs_copy_buffer, flags, coher, @@ -409,7 +409,7 @@ static void si_compute_do_clear_or_copy(struct si_context *sctx, struct pipe_res if (!sctx->cs_clear_buffer) { sctx->cs_clear_buffer = si_create_dma_compute_shader( - &sctx->b, SI_COMPUTE_CLEAR_DW_PER_THREAD, shader_dst_stream_policy, false); + sctx, SI_COMPUTE_CLEAR_DW_PER_THREAD, shader_dst_stream_policy, false); } si_launch_grid_internal_ssbos(sctx, &info, sctx->cs_clear_buffer, flags, coher, diff --git a/src/gallium/drivers/radeonsi/si_pipe.h b/src/gallium/drivers/radeonsi/si_pipe.h index 78a35d3a4f1..ec40792ae37 100644 --- a/src/gallium/drivers/radeonsi/si_pipe.h +++ b/src/gallium/drivers/radeonsi/si_pipe.h @@ -1650,7 +1650,7 @@ void *si_create_blit_cs(struct si_context *sctx, const union si_compute_blit_sha /* si_shaderlib_tgsi.c */ void *si_get_blitter_vs(struct si_context *sctx, enum blitter_attrib_type type, unsigned num_layers); -void *si_create_dma_compute_shader(struct pipe_context *ctx, unsigned num_dwords_per_thread, +void *si_create_dma_compute_shader(struct si_context *sctx, unsigned num_dwords_per_thread, bool dst_stream_cache_policy, bool is_copy); void *si_create_clear_buffer_rmw_cs(struct si_context *sctx); void *si_clear_render_target_shader(struct si_context *sctx, enum pipe_texture_target type); diff --git a/src/gallium/drivers/radeonsi/si_shaderlib_nir.c b/src/gallium/drivers/radeonsi/si_shaderlib_nir.c index 68976973611..554b3bcfd0b 100644 --- a/src/gallium/drivers/radeonsi/si_shaderlib_nir.c +++ b/src/gallium/drivers/radeonsi/si_shaderlib_nir.c @@ -645,3 +645,85 @@ void *si_clear_12bytes_buffer_shader(struct si_context *sctx) return create_shader_state(sctx, b.shader); } + +/* Create a compute shader implementing clear_buffer or copy_buffer. */ +void *si_create_dma_compute_shader(struct si_context *sctx, unsigned num_dwords_per_thread, + bool dst_stream_cache_policy, bool is_copy) +{ + assert(util_is_power_of_two_nonzero(num_dwords_per_thread)); + + const nir_shader_compiler_options *options = + sctx->b.screen->get_compiler_options(sctx->b.screen, PIPE_SHADER_IR_NIR, PIPE_SHADER_COMPUTE); + + unsigned store_qualifier = ACCESS_COHERENT | ACCESS_RESTRICT; + if (dst_stream_cache_policy) + store_qualifier |= ACCESS_NON_TEMPORAL; + + /* Don't cache loads, because there is no reuse. */ + unsigned load_qualifier = store_qualifier | ACCESS_NON_TEMPORAL; + + nir_builder b = + nir_builder_init_simple_shader(MESA_SHADER_COMPUTE, options, "create_dma_compute"); + + unsigned default_wave_size = si_determine_wave_size(sctx->screen, NULL); + + b.shader->info.workgroup_size[0] = default_wave_size; + b.shader->info.workgroup_size[1] = 1; + b.shader->info.workgroup_size[2] = 1; + b.shader->info.num_ssbos = 1; + + unsigned num_mem_ops = MAX2(1, num_dwords_per_thread / 4); + unsigned *inst_dwords = alloca(num_mem_ops * sizeof(unsigned)); + + for (unsigned i = 0; i < num_mem_ops; i++) { + if (i * 4 < num_dwords_per_thread) + inst_dwords[i] = MIN2(4, num_dwords_per_thread - i * 4); + } + + /* If there are multiple stores, + * the first store writes into 0 * wavesize + tid, + * the 2nd store writes into 1 * wavesize + tid, + * the 3rd store writes into 2 * wavesize + tid, etc. + */ + nir_def *store_address = get_global_ids(&b, 1); + + /* Convert from a "store size unit" into bytes. */ + store_address = nir_imul_imm(&b, store_address, 4 * inst_dwords[0]); + + nir_def *load_address = store_address, *value, *values[num_mem_ops]; + value = nir_undef(&b, 1, 32); + + if (is_copy) { + b.shader->info.num_ssbos++; + } else { + b.shader->info.cs.user_data_components_amd = inst_dwords[0]; + value = nir_trim_vector(&b, nir_load_user_data_amd(&b), inst_dwords[0]); + } + + /* Distance between a load and a store for latency hiding. */ + unsigned load_store_distance = is_copy ? 8 : 0; + + for (unsigned i = 0; i < num_mem_ops + load_store_distance; i++) { + int d = i - load_store_distance; + + if (is_copy && i < num_mem_ops) { + if (i) { + load_address = nir_iadd(&b, load_address, + nir_imm_int(&b, 4 * inst_dwords[i] * default_wave_size)); + } + values[i] = nir_load_ssbo(&b, 4, 32, nir_imm_int(&b, 1),load_address, + .access = load_qualifier); + } + + if (d >= 0) { + if (d) { + store_address = nir_iadd(&b, store_address, + nir_imm_int(&b, 4 * inst_dwords[d] * default_wave_size)); + } + nir_store_ssbo(&b, is_copy ? values[d] : value, nir_imm_int(&b, 0), store_address, + .access = store_qualifier); + } + } + + return create_shader_state(sctx, b.shader); +} diff --git a/src/gallium/drivers/radeonsi/si_shaderlib_tgsi.c b/src/gallium/drivers/radeonsi/si_shaderlib_tgsi.c index 3b13d4188b8..8d9b4256fb5 100644 --- a/src/gallium/drivers/radeonsi/si_shaderlib_tgsi.c +++ b/src/gallium/drivers/radeonsi/si_shaderlib_tgsi.c @@ -67,112 +67,6 @@ void *si_get_blitter_vs(struct si_context *sctx, enum blitter_attrib_type type, return *vs; } -/* Create a compute shader implementing clear_buffer or copy_buffer. */ -void *si_create_dma_compute_shader(struct pipe_context *ctx, unsigned num_dwords_per_thread, - bool dst_stream_cache_policy, bool is_copy) -{ - struct si_screen *sscreen = (struct si_screen *)ctx->screen; - assert(util_is_power_of_two_nonzero(num_dwords_per_thread)); - - unsigned store_qualifier = TGSI_MEMORY_COHERENT | TGSI_MEMORY_RESTRICT; - if (dst_stream_cache_policy) - store_qualifier |= TGSI_MEMORY_STREAM_CACHE_POLICY; - - /* Don't cache loads, because there is no reuse. */ - unsigned load_qualifier = store_qualifier | TGSI_MEMORY_STREAM_CACHE_POLICY; - - unsigned num_mem_ops = MAX2(1, num_dwords_per_thread / 4); - unsigned *inst_dwords = alloca(num_mem_ops * sizeof(unsigned)); - - for (unsigned i = 0; i < num_mem_ops; i++) { - if (i * 4 < num_dwords_per_thread) - inst_dwords[i] = MIN2(4, num_dwords_per_thread - i * 4); - } - - struct ureg_program *ureg = ureg_create(PIPE_SHADER_COMPUTE); - if (!ureg) - return NULL; - - unsigned default_wave_size = si_determine_wave_size(sscreen, NULL); - - ureg_property(ureg, TGSI_PROPERTY_CS_FIXED_BLOCK_WIDTH, default_wave_size); - ureg_property(ureg, TGSI_PROPERTY_CS_FIXED_BLOCK_HEIGHT, 1); - ureg_property(ureg, TGSI_PROPERTY_CS_FIXED_BLOCK_DEPTH, 1); - - struct ureg_src value; - if (!is_copy) { - ureg_property(ureg, TGSI_PROPERTY_CS_USER_DATA_COMPONENTS_AMD, inst_dwords[0]); - value = ureg_DECL_system_value(ureg, TGSI_SEMANTIC_CS_USER_DATA_AMD, 0); - } - - struct ureg_src tid = ureg_DECL_system_value(ureg, TGSI_SEMANTIC_THREAD_ID, 0); - struct ureg_src blk = ureg_DECL_system_value(ureg, TGSI_SEMANTIC_BLOCK_ID, 0); - struct ureg_dst store_addr = ureg_writemask(ureg_DECL_temporary(ureg), TGSI_WRITEMASK_X); - struct ureg_dst load_addr = ureg_writemask(ureg_DECL_temporary(ureg), TGSI_WRITEMASK_X); - struct ureg_dst dstbuf = ureg_dst(ureg_DECL_buffer(ureg, 0, false)); - struct ureg_src srcbuf; - struct ureg_src *values = NULL; - - if (is_copy) { - srcbuf = ureg_DECL_buffer(ureg, 1, false); - values = malloc(num_mem_ops * sizeof(struct ureg_src)); - } - - /* If there are multiple stores, the first store writes into 0*wavesize+tid, - * the 2nd store writes into 1*wavesize+tid, the 3rd store writes into 2*wavesize+tid, etc. - */ - ureg_UMAD(ureg, store_addr, blk, ureg_imm1u(ureg, default_wave_size * num_mem_ops), - tid); - /* Convert from a "store size unit" into bytes. */ - ureg_UMUL(ureg, store_addr, ureg_src(store_addr), ureg_imm1u(ureg, 4 * inst_dwords[0])); - ureg_MOV(ureg, load_addr, ureg_src(store_addr)); - - /* Distance between a load and a store for latency hiding. */ - unsigned load_store_distance = is_copy ? 8 : 0; - - for (unsigned i = 0; i < num_mem_ops + load_store_distance; i++) { - int d = i - load_store_distance; - - if (is_copy && i < num_mem_ops) { - if (i) { - ureg_UADD(ureg, load_addr, ureg_src(load_addr), - ureg_imm1u(ureg, 4 * inst_dwords[i] * default_wave_size)); - } - - values[i] = ureg_src(ureg_DECL_temporary(ureg)); - struct ureg_dst dst = - ureg_writemask(ureg_dst(values[i]), u_bit_consecutive(0, inst_dwords[i])); - struct ureg_src srcs[] = {srcbuf, ureg_src(load_addr)}; - ureg_memory_insn(ureg, TGSI_OPCODE_LOAD, &dst, 1, srcs, 2, load_qualifier, - TGSI_TEXTURE_BUFFER, 0); - } - - if (d >= 0) { - if (d) { - ureg_UADD(ureg, store_addr, ureg_src(store_addr), - ureg_imm1u(ureg, 4 * inst_dwords[d] * default_wave_size)); - } - - struct ureg_dst dst = ureg_writemask(dstbuf, u_bit_consecutive(0, inst_dwords[d])); - struct ureg_src srcs[] = {ureg_src(store_addr), is_copy ? values[d] : value}; - ureg_memory_insn(ureg, TGSI_OPCODE_STORE, &dst, 1, srcs, 2, store_qualifier, - TGSI_TEXTURE_BUFFER, 0); - } - } - ureg_END(ureg); - - struct pipe_compute_state state = {}; - state.ir_type = PIPE_SHADER_IR_TGSI; - state.prog = ureg_get_tokens(ureg, NULL); - - void *cs = ctx->create_compute_state(ctx, &state); - ureg_destroy(ureg); - ureg_free_tokens(state.prog); - - free(values); - return cs; -} - /* Create the compute shader that is used to collect the results. * * One compute grid with a single thread is launched for every query result diff --git a/src/gallium/drivers/radeonsi/si_test_dma_perf.c b/src/gallium/drivers/radeonsi/si_test_dma_perf.c index 0a1d58222cc..b09d9752f5f 100644 --- a/src/gallium/drivers/radeonsi/si_test_dma_perf.c +++ b/src/gallium/drivers/radeonsi/si_test_dma_perf.c @@ -119,7 +119,7 @@ void si_test_dma_perf(struct si_screen *sscreen) void *compute_shader = NULL; if (test_cs) { - compute_shader = si_create_dma_compute_shader(ctx, cs_dwords_per_thread, + compute_shader = si_create_dma_compute_shader(sctx, cs_dwords_per_thread, cache_policy == L2_STREAM, is_copy); }