From f9b527a9a501bf15c33957ac7ae431d996aea606 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Marek=20Ol=C5=A1=C3=A1k?= Date: Sun, 4 Apr 2021 20:19:55 -0400 Subject: [PATCH] radeonsi: unify internal compute with SSBOs in si_launch_grid_internal_ssbos just deduplicate the code Reviewed-by: Pierre-Eric Pelloux-Prayer Part-of: --- src/gallium/drivers/radeonsi/gfx10_query.c | 6 +- .../drivers/radeonsi/si_compute_blit.c | 175 ++++++------------ src/gallium/drivers/radeonsi/si_pipe.h | 4 + src/gallium/drivers/radeonsi/si_query.c | 10 +- src/gallium/drivers/radeonsi/si_query.h | 3 - src/gallium/drivers/radeonsi/si_state.c | 18 -- 6 files changed, 62 insertions(+), 154 deletions(-) diff --git a/src/gallium/drivers/radeonsi/gfx10_query.c b/src/gallium/drivers/radeonsi/gfx10_query.c index 191d28abde8..1a430090161 100644 --- a/src/gallium/drivers/radeonsi/gfx10_query.c +++ b/src/gallium/drivers/radeonsi/gfx10_query.c @@ -387,7 +387,6 @@ static void gfx10_sh_query_get_result_resource(struct si_context *sctx, struct s } sctx->b.set_constant_buffer(&sctx->b, PIPE_SHADER_COMPUTE, 0, false, &constant_buffer); - sctx->b.set_shader_buffers(&sctx->b, PIPE_SHADER_COMPUTE, 0, 3, ssbo, 0x6); if (wait) { uint64_t va; @@ -403,8 +402,9 @@ static void gfx10_sh_query_get_result_resource(struct si_context *sctx, struct s si_cp_wait_mem(sctx, &sctx->gfx_cs, va, 0x00000001, 0x00000001, 0); } - si_launch_grid_internal((struct si_context *)&sctx->b, &grid, sctx->sh_query_result_shader, - SI_OP_SYNC_PS_BEFORE | SI_OP_SYNC_AFTER); + si_launch_grid_internal_ssbos(sctx, &grid, sctx->sh_query_result_shader, + SI_OP_SYNC_PS_BEFORE | SI_OP_SYNC_AFTER, SI_COHERENCY_SHADER, + 3, ssbo, 0x6); if (qbuf == query->last) break; diff --git a/src/gallium/drivers/radeonsi/si_compute_blit.c b/src/gallium/drivers/radeonsi/si_compute_blit.c index c8bdee34546..d377c24774f 100644 --- a/src/gallium/drivers/radeonsi/si_compute_blit.c +++ b/src/gallium/drivers/radeonsi/si_compute_blit.c @@ -116,6 +116,47 @@ void si_launch_grid_internal(struct si_context *sctx, struct pipe_grid_info *inf } } +void si_launch_grid_internal_ssbos(struct si_context *sctx, struct pipe_grid_info *info, + void *shader, unsigned flags, enum si_coherency coher, + unsigned num_buffers, const struct pipe_shader_buffer *buffers, + unsigned writeable_bitmask) +{ + if (!(flags & SI_OP_SKIP_CACHE_INV_BEFORE)) + sctx->flags |= si_get_flush_flags(sctx, coher, SI_COMPUTE_DST_CACHE_POLICY); + + /* Save states. */ + struct pipe_shader_buffer saved_sb[3] = {}; + assert(num_buffers <= ARRAY_SIZE(saved_sb)); + si_get_shader_buffers(sctx, PIPE_SHADER_COMPUTE, 0, num_buffers, saved_sb); + + unsigned saved_writable_mask = 0; + for (unsigned i = 0; i < num_buffers; i++) { + if (sctx->const_and_shader_buffers[PIPE_SHADER_COMPUTE].writable_mask & + (1u << si_get_shaderbuf_slot(i))) + saved_writable_mask |= 1 << i; + } + + /* Bind buffers and launch compute. */ + sctx->b.set_shader_buffers(&sctx->b, PIPE_SHADER_COMPUTE, 0, num_buffers, buffers, + writeable_bitmask); + si_launch_grid_internal(sctx, info, shader, flags); + + /* Do cache flushing at the end. */ + if (get_cache_policy(sctx, coher, 0) == L2_BYPASS) { + if (flags & SI_OP_SYNC_AFTER) + sctx->flags |= SI_CONTEXT_WB_L2; + } else { + while (writeable_bitmask) + si_resource(buffers[u_bit_scan(&writeable_bitmask)].buffer)->TC_L2_dirty = true; + } + + /* Restore states. */ + sctx->b.set_shader_buffers(&sctx->b, PIPE_SHADER_COMPUTE, 0, num_buffers, saved_sb, + saved_writable_mask); + for (int i = 0; i < num_buffers; i++) + pipe_resource_reference(&saved_sb[i].buffer, NULL); +} + /** * Clear a buffer using read-modify-write with a 32-bit write bitmask. * The clear value has 32 bits. @@ -125,25 +166,11 @@ void si_compute_clear_buffer_rmw(struct si_context *sctx, struct pipe_resource * uint32_t clear_value, uint32_t writebitmask, unsigned flags, enum si_coherency coher) { - struct pipe_context *ctx = &sctx->b; - assert(dst_offset % 4 == 0); assert(size % 4 == 0); assert(dst->target != PIPE_BUFFER || dst_offset + size <= dst->width0); - if (!(flags & SI_OP_SKIP_CACHE_INV_BEFORE)) - sctx->flags |= si_get_flush_flags(sctx, coher, SI_COMPUTE_DST_CACHE_POLICY); - - /* Save states. */ - struct pipe_shader_buffer saved_sb = {}; - si_get_shader_buffers(sctx, PIPE_SHADER_COMPUTE, 0, 1, &saved_sb); - - unsigned saved_writable_mask = 0; - if (sctx->const_and_shader_buffers[PIPE_SHADER_COMPUTE].writable_mask & - (1u << si_get_shaderbuf_slot(0))) - saved_writable_mask |= 1 << 0; - /* Use buffer_load_dwordx4 and buffer_store_dwordx4 per thread. */ unsigned dwords_per_instruction = 4; unsigned wave_size = sctx->screen->compute_wave_size; @@ -164,7 +191,6 @@ void si_compute_clear_buffer_rmw(struct si_context *sctx, struct pipe_resource * sb.buffer = dst; sb.buffer_offset = dst_offset; sb.buffer_size = size; - ctx->set_shader_buffers(ctx, PIPE_SHADER_COMPUTE, 0, 1, &sb, 0x1); sctx->cs_user_data[0] = clear_value & writebitmask; sctx->cs_user_data[1] = ~writebitmask; @@ -172,19 +198,8 @@ void si_compute_clear_buffer_rmw(struct si_context *sctx, struct pipe_resource * if (!sctx->cs_clear_buffer_rmw) sctx->cs_clear_buffer_rmw = si_create_clear_buffer_rmw_cs(&sctx->b); - si_launch_grid_internal(sctx, &info, sctx->cs_clear_buffer_rmw, flags); - - enum si_cache_policy cache_policy = get_cache_policy(sctx, coher, size); - - if (flags & SI_OP_SYNC_AFTER) - sctx->flags |= cache_policy == L2_BYPASS ? SI_CONTEXT_WB_L2 : 0; - - if (cache_policy != L2_BYPASS) - si_resource(dst)->TC_L2_dirty = true; - - /* Restore states. */ - ctx->set_shader_buffers(ctx, PIPE_SHADER_COMPUTE, 0, 1, &saved_sb, saved_writable_mask); - pipe_resource_reference(&saved_sb.buffer, NULL); + si_launch_grid_internal_ssbos(sctx, &info, sctx->cs_clear_buffer_rmw, flags, coher, + 1, &sb, 0x1); } static void si_compute_clear_12bytes_buffer(struct si_context *sctx, struct pipe_resource *dst, @@ -198,24 +213,11 @@ static void si_compute_clear_12bytes_buffer(struct si_context *sctx, struct pipe assert(size % 4 == 0); unsigned size_12 = DIV_ROUND_UP(size, 12); - if (!(flags & SI_OP_SKIP_CACHE_INV_BEFORE)) - sctx->flags |= si_get_flush_flags(sctx, coher, SI_COMPUTE_DST_CACHE_POLICY); - - struct pipe_shader_buffer saved_sb = {0}; - si_get_shader_buffers(sctx, PIPE_SHADER_COMPUTE, 0, 1, &saved_sb); - - unsigned saved_writable_mask = 0; - if (sctx->const_and_shader_buffers[PIPE_SHADER_COMPUTE].writable_mask & - (1u << si_get_shaderbuf_slot(0))) - saved_writable_mask = 1; - struct pipe_shader_buffer sb = {0}; sb.buffer = dst; sb.buffer_offset = dst_offset; sb.buffer_size = size; - ctx->set_shader_buffers(ctx, PIPE_SHADER_COMPUTE, 0, 1, &sb, 0x1); - memcpy(sctx->cs_user_data, clear_value, 12); struct pipe_grid_info info = {0}; @@ -231,10 +233,8 @@ static void si_compute_clear_12bytes_buffer(struct si_context *sctx, struct pipe info.grid[1] = 1; info.grid[2] = 1; - si_launch_grid_internal(sctx, &info, sctx->cs_clear_12bytes_buffer, flags); - - ctx->set_shader_buffers(ctx, PIPE_SHADER_COMPUTE, 0, 1, &saved_sb, saved_writable_mask); - pipe_resource_reference(&saved_sb.buffer, NULL); + si_launch_grid_internal_ssbos(sctx, &info, sctx->cs_clear_12bytes_buffer, flags, coher, + 1, &sb, 0x1); } static void si_compute_do_clear_or_copy(struct si_context *sctx, struct pipe_resource *dst, @@ -243,8 +243,6 @@ static void si_compute_do_clear_or_copy(struct si_context *sctx, struct pipe_res const uint32_t *clear_value, unsigned clear_value_size, unsigned flags, enum si_coherency coher) { - struct pipe_context *ctx = &sctx->b; - assert(src_offset % 4 == 0); assert(dst_offset % 4 == 0); assert(size % 4 == 0); @@ -252,20 +250,6 @@ static void si_compute_do_clear_or_copy(struct si_context *sctx, struct pipe_res assert(dst->target != PIPE_BUFFER || dst_offset + size <= dst->width0); assert(!src || src_offset + size <= src->width0); - if (!(flags & SI_OP_SKIP_CACHE_INV_BEFORE)) - sctx->flags |= si_get_flush_flags(sctx, coher, SI_COMPUTE_DST_CACHE_POLICY); - - /* Save states. */ - struct pipe_shader_buffer saved_sb[2] = {}; - si_get_shader_buffers(sctx, PIPE_SHADER_COMPUTE, 0, src ? 2 : 1, saved_sb); - - unsigned saved_writable_mask = 0; - for (unsigned i = 0; i < (src ? 2 : 1); i++) { - if (sctx->const_and_shader_buffers[PIPE_SHADER_COMPUTE].writable_mask & - (1u << si_get_shaderbuf_slot(i))) - saved_writable_mask |= 1 << i; - } - /* The memory accesses are coalesced, meaning that the 1st instruction writes * the 1st contiguous block of data for the whole wave, the 2nd instruction * writes the 2nd contiguous block of data, etc. @@ -300,14 +284,13 @@ static void si_compute_do_clear_or_copy(struct si_context *sctx, struct pipe_res sb[1].buffer_offset = src_offset; sb[1].buffer_size = size; - ctx->set_shader_buffers(ctx, PIPE_SHADER_COMPUTE, 0, 2, sb, 0x1); - if (!sctx->cs_copy_buffer) { sctx->cs_copy_buffer = si_create_dma_compute_shader( &sctx->b, SI_COMPUTE_COPY_DW_PER_THREAD, shader_dst_stream_policy, true); } - si_launch_grid_internal(sctx, &info, sctx->cs_copy_buffer, flags); + si_launch_grid_internal_ssbos(sctx, &info, sctx->cs_copy_buffer, flags, coher, + 2, sb, 0x1); } else { assert(clear_value_size >= 4 && clear_value_size <= 16 && util_is_power_of_two_or_zero(clear_value_size)); @@ -315,28 +298,14 @@ static void si_compute_do_clear_or_copy(struct si_context *sctx, struct pipe_res for (unsigned i = 0; i < 4; i++) sctx->cs_user_data[i] = clear_value[i % (clear_value_size / 4)]; - ctx->set_shader_buffers(ctx, PIPE_SHADER_COMPUTE, 0, 1, sb, 0x1); - if (!sctx->cs_clear_buffer) { sctx->cs_clear_buffer = si_create_dma_compute_shader( &sctx->b, SI_COMPUTE_CLEAR_DW_PER_THREAD, shader_dst_stream_policy, false); } - si_launch_grid_internal(sctx, &info, sctx->cs_clear_buffer, flags); + si_launch_grid_internal_ssbos(sctx, &info, sctx->cs_clear_buffer, flags, coher, + 1, sb, 0x1); } - - enum si_cache_policy cache_policy = get_cache_policy(sctx, coher, size); - - if (flags & SI_OP_SYNC_AFTER) - sctx->flags |= cache_policy == L2_BYPASS ? SI_CONTEXT_WB_L2 : 0; - - if (cache_policy != L2_BYPASS) - si_resource(dst)->TC_L2_dirty = true; - - /* Restore states. */ - ctx->set_shader_buffers(ctx, PIPE_SHADER_COMPUTE, 0, src ? 2 : 1, saved_sb, saved_writable_mask); - for (int i = 0; i < 2; i++) - pipe_resource_reference(&saved_sb[i].buffer, NULL); } void si_clear_buffer(struct si_context *sctx, struct pipe_resource *dst, @@ -653,19 +622,6 @@ void si_compute_copy_image(struct si_context *sctx, struct pipe_resource *dst, u void si_retile_dcc(struct si_context *sctx, struct si_texture *tex) { - struct pipe_context *ctx = &sctx->b; - - sctx->flags |= si_get_flush_flags(sctx, SI_COHERENCY_CB_META, L2_LRU); - - /* Save states. */ - struct pipe_shader_buffer saved_sb = {}; - si_get_shader_buffers(sctx, PIPE_SHADER_COMPUTE, 0, 1, &saved_sb); - - unsigned saved_writable_mask = 0; - if (sctx->const_and_shader_buffers[PIPE_SHADER_COMPUTE].writable_mask & - (1u << si_get_shaderbuf_slot(0))) - saved_writable_mask |= 1 << 0; - /* Set the DCC buffer. */ assert(tex->surface.meta_offset && tex->surface.meta_offset <= UINT_MAX); assert(tex->surface.display_dcc_offset && tex->surface.display_dcc_offset <= UINT_MAX); @@ -676,7 +632,6 @@ void si_retile_dcc(struct si_context *sctx, struct si_texture *tex) sb.buffer = &tex->buffer.b.b; sb.buffer_offset = tex->surface.display_dcc_offset; sb.buffer_size = tex->buffer.bo_size - sb.buffer_offset; - ctx->set_shader_buffers(ctx, PIPE_SHADER_COMPUTE, 0, 1, &sb, 0x1); sctx->cs_user_data[0] = tex->surface.meta_offset - tex->surface.display_dcc_offset; sctx->cs_user_data[1] = (tex->surface.u.gfx9.color.dcc_pitch_max + 1) | @@ -709,33 +664,17 @@ void si_retile_dcc(struct si_context *sctx, struct si_texture *tex) info.grid[1] = DIV_ROUND_UP(height, info.block[1]); info.grid[2] = 1; - si_launch_grid_internal(sctx, &info, sctx->cs_dcc_retile, SI_OP_SYNC_BEFORE); + si_launch_grid_internal_ssbos(sctx, &info, sctx->cs_dcc_retile, SI_OP_SYNC_BEFORE, + SI_COHERENCY_CB_META, 1, &sb, 0x1); /* Don't flush caches. L2 will be flushed by the kernel fence. */ - - /* Restore states. */ - ctx->set_shader_buffers(ctx, PIPE_SHADER_COMPUTE, 0, 1, &saved_sb, saved_writable_mask); - pipe_resource_reference(&saved_sb.buffer, NULL); } void gfx9_clear_dcc_msaa(struct si_context *sctx, struct pipe_resource *res, uint32_t clear_value, unsigned flags, enum si_coherency coher) { - struct pipe_context *ctx = &sctx->b; struct si_texture *tex = (struct si_texture*)res; - if (!(flags & SI_OP_SKIP_CACHE_INV_BEFORE)) - sctx->flags |= si_get_flush_flags(sctx, coher, SI_COMPUTE_DST_CACHE_POLICY); - - /* Save states. */ - struct pipe_shader_buffer saved_sb = {}; - si_get_shader_buffers(sctx, PIPE_SHADER_COMPUTE, 0, 1, &saved_sb); - - unsigned saved_writable_mask = 0; - if (sctx->const_and_shader_buffers[PIPE_SHADER_COMPUTE].writable_mask & - (1u << si_get_shaderbuf_slot(0))) - saved_writable_mask |= 1 << 0; - /* Set the DCC buffer. */ assert(tex->surface.meta_offset && tex->surface.meta_offset <= UINT_MAX); assert(tex->buffer.bo_size <= UINT_MAX); @@ -744,7 +683,6 @@ void gfx9_clear_dcc_msaa(struct si_context *sctx, struct pipe_resource *res, uin sb.buffer = &tex->buffer.b.b; sb.buffer_offset = tex->surface.meta_offset; sb.buffer_size = tex->buffer.bo_size - sb.buffer_offset; - ctx->set_shader_buffers(ctx, PIPE_SHADER_COMPUTE, 0, 1, &sb, 0x1); sctx->cs_user_data[0] = (tex->surface.u.gfx9.color.dcc_pitch_max + 1) | (tex->surface.u.gfx9.color.dcc_height << 16); @@ -776,16 +714,7 @@ void gfx9_clear_dcc_msaa(struct si_context *sctx, struct pipe_resource *res, uin info.grid[1] = DIV_ROUND_UP(height, info.block[1]); info.grid[2] = depth; - si_launch_grid_internal(sctx, &info, *shader, flags); - - enum si_cache_policy cache_policy = get_cache_policy(sctx, coher, tex->surface.meta_size); - - if (flags & SI_OP_SYNC_AFTER) - sctx->flags |= cache_policy == L2_BYPASS ? SI_CONTEXT_WB_L2 : 0; - - /* Restore states. */ - ctx->set_shader_buffers(ctx, PIPE_SHADER_COMPUTE, 0, 1, &saved_sb, saved_writable_mask); - pipe_resource_reference(&saved_sb.buffer, NULL); + si_launch_grid_internal_ssbos(sctx, &info, *shader, flags, coher, 1, &sb, 0x1); } /* Expand FMASK to make it identity, so that image stores can ignore it. */ diff --git a/src/gallium/drivers/radeonsi/si_pipe.h b/src/gallium/drivers/radeonsi/si_pipe.h index 6eb328ab3db..27b4b8af050 100644 --- a/src/gallium/drivers/radeonsi/si_pipe.h +++ b/src/gallium/drivers/radeonsi/si_pipe.h @@ -1402,6 +1402,10 @@ unsigned si_get_flush_flags(struct si_context *sctx, enum si_coherency coher, enum si_cache_policy cache_policy); void si_launch_grid_internal(struct si_context *sctx, struct pipe_grid_info *info, void *shader, unsigned flags); +void si_launch_grid_internal_ssbos(struct si_context *sctx, struct pipe_grid_info *info, + void *shader, unsigned flags, enum si_coherency coher, + unsigned num_buffers, const struct pipe_shader_buffer *buffers, + unsigned writeable_bitmask); enum si_clear_method { SI_CP_DMA_CLEAR_METHOD, SI_COMPUTE_CLEAR_METHOD, diff --git a/src/gallium/drivers/radeonsi/si_query.c b/src/gallium/drivers/radeonsi/si_query.c index c5fd7d2722c..47165735afb 100644 --- a/src/gallium/drivers/radeonsi/si_query.c +++ b/src/gallium/drivers/radeonsi/si_query.c @@ -1500,8 +1500,6 @@ static void si_query_hw_get_result_resource(struct si_context *sctx, struct si_q ssbo[2] = ssbo[1]; - sctx->b.bind_compute_state(&sctx->b, sctx->query_result_shader); - grid.block[0] = 1; grid.block[1] = 1; grid.block[2] = 1; @@ -1566,8 +1564,6 @@ static void si_query_hw_get_result_resource(struct si_context *sctx, struct si_q si_resource(resource)->TC_L2_dirty = true; } - sctx->b.set_shader_buffers(&sctx->b, PIPE_SHADER_COMPUTE, 0, 3, ssbo, 1 << 2); - if (wait && qbuf == &query->buffer) { uint64_t va; @@ -1580,9 +1576,9 @@ static void si_query_hw_get_result_resource(struct si_context *sctx, struct si_q si_cp_wait_mem(sctx, &sctx->gfx_cs, va, 0x80000000, 0x80000000, WAIT_REG_MEM_EQUAL); } - - sctx->b.launch_grid(&sctx->b, &grid); - sctx->flags |= SI_CONTEXT_CS_PARTIAL_FLUSH | SI_CONTEXT_PFP_SYNC_ME; + si_launch_grid_internal_ssbos(sctx, &grid, sctx->query_result_shader, + SI_OP_SYNC_AFTER, SI_COHERENCY_SHADER, + 3, ssbo, 0x4); } si_restore_qbo_state(sctx, &saved_state); diff --git a/src/gallium/drivers/radeonsi/si_query.h b/src/gallium/drivers/radeonsi/si_query.h index 89e75eca548..c608af694e2 100644 --- a/src/gallium/drivers/radeonsi/si_query.h +++ b/src/gallium/drivers/radeonsi/si_query.h @@ -292,10 +292,7 @@ int si_get_perfcounter_group_info(struct si_screen *, unsigned index, struct pipe_driver_query_group_info *info); struct si_qbo_state { - void *saved_compute; struct pipe_constant_buffer saved_const0; - struct pipe_shader_buffer saved_ssbo[3]; - unsigned saved_ssbo_writable_mask; }; #endif /* SI_QUERY_H */ diff --git a/src/gallium/drivers/radeonsi/si_state.c b/src/gallium/drivers/radeonsi/si_state.c index f1c1fd0f4db..54f87302380 100644 --- a/src/gallium/drivers/radeonsi/si_state.c +++ b/src/gallium/drivers/radeonsi/si_state.c @@ -1333,30 +1333,12 @@ void si_set_occlusion_query_state(struct si_context *sctx, bool old_perfect_enab void si_save_qbo_state(struct si_context *sctx, struct si_qbo_state *st) { - st->saved_compute = sctx->cs_shader_state.program; - si_get_pipe_constant_buffer(sctx, PIPE_SHADER_COMPUTE, 0, &st->saved_const0); - si_get_shader_buffers(sctx, PIPE_SHADER_COMPUTE, 0, 3, st->saved_ssbo); - - st->saved_ssbo_writable_mask = 0; - - for (unsigned i = 0; i < 3; i++) { - if (sctx->const_and_shader_buffers[PIPE_SHADER_COMPUTE].writable_mask & - (1u << si_get_shaderbuf_slot(i))) - st->saved_ssbo_writable_mask |= 1 << i; - } } void si_restore_qbo_state(struct si_context *sctx, struct si_qbo_state *st) { - sctx->b.bind_compute_state(&sctx->b, st->saved_compute); - sctx->b.set_constant_buffer(&sctx->b, PIPE_SHADER_COMPUTE, 0, true, &st->saved_const0); - - sctx->b.set_shader_buffers(&sctx->b, PIPE_SHADER_COMPUTE, 0, 3, st->saved_ssbo, - st->saved_ssbo_writable_mask); - for (unsigned i = 0; i < 3; ++i) - pipe_resource_reference(&st->saved_ssbo[i].buffer, NULL); } static void si_emit_db_render_state(struct si_context *sctx)