radeonsi: do Z-only or S-only HTILE clear using a compute shader doing RMW

This adds a clear_buffer compute shader that does read-modify-write to
update a subset of bits in HTILE.

Reviewed-by: Pierre-Eric Pelloux-Prayer <pierre-eric.pelloux-prayer@amd.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/10003>
Marek Olšák 2021-03-21 16:57:15 -04:00 committed by Marge Bot
parent 84fa21a611
commit 06b6af596c
5 changed files with 182 additions and 4 deletions
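
For reference, the per-dword operation the new shader performs is a plain
mask merge. A minimal C model (illustration only, not part of the commit;
the user-data packing mirrors si_compute_clear_buffer_rmw below):

   #include <assert.h>
   #include <stdint.h>

   /* Model of one dword of the RMW clear. The shader receives:
    *   cs_user_data[0] = clear_value & writemask  (bits to set)
    *   cs_user_data[1] = ~writemask               (bits to keep) */
   static uint32_t clear_dword_rmw(uint32_t old, uint32_t clear_value,
                                   uint32_t writemask)
   {
      return (old & ~writemask) | (clear_value & writemask);
   }

   int main(void)
   {
      /* Stencil-only HTILE clear: only bits 9:4 of the dword may change. */
      uint32_t updated = clear_dword_rmw(0xdeadbeef, 0x000000f0, 0x000003f0);
      assert((updated & ~0x000003f0u) == (0xdeadbeefu & ~0x000003f0u));
      return 0;
   }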

--- a/src/gallium/drivers/radeonsi/si_clear.c
+++ b/src/gallium/drivers/radeonsi/si_clear.c

@@ -42,6 +42,15 @@ void si_init_buffer_clear(struct si_clear_info *info,
    info->offset = offset;
    info->size = size;
    info->clear_value = clear_value;
+   info->writemask = 0xffffffff;
 }
 
+static void si_init_buffer_clear_rmw(struct si_clear_info *info,
+                                     struct pipe_resource *resource, uint64_t offset,
+                                     uint32_t size, uint32_t clear_value, uint32_t writemask)
+{
+   si_init_buffer_clear(info, resource, offset, size, clear_value);
+   info->writemask = writemask;
+}
+
 void si_execute_clears(struct si_context *sctx, struct si_clear_info *info,
@@ -66,10 +75,18 @@ void si_execute_clears(struct si_context *sctx, struct si_clear_info *info,
    /* Execute clears. */
    for (unsigned i = 0; i < num_clears; i++) {
-      /* Compute shaders are much faster on both dGPUs and APUs. Don't use CP DMA. */
-      si_clear_buffer(sctx, info[i].resource, info[i].offset, info[i].size,
-                      &info[i].clear_value, 4, SI_OP_SKIP_CACHE_INV_BEFORE,
-                      SI_COHERENCY_CP, SI_COMPUTE_CLEAR_METHOD);
+      assert(info[i].size > 0);
+
+      if (info[i].writemask != 0xffffffff) {
+         si_compute_clear_buffer_rmw(sctx, info[i].resource, info[i].offset, info[i].size,
+                                     info[i].clear_value, info[i].writemask,
+                                     SI_OP_SKIP_CACHE_INV_BEFORE, SI_COHERENCY_CP);
+      } else {
+         /* Compute shaders are much faster on both dGPUs and APUs. Don't use CP DMA. */
+         si_clear_buffer(sctx, info[i].resource, info[i].offset, info[i].size,
+                         &info[i].clear_value, 4, SI_OP_SKIP_CACHE_INV_BEFORE,
+                         SI_COHERENCY_CP, SI_COMPUTE_CLEAR_METHOD);
+      }
    }
 
    /* Wait for idle. */
@@ -885,6 +902,41 @@ static void si_fast_clear(struct si_context *sctx, unsigned *buffers,
             update_db_depth_clear = true;
             update_db_stencil_clear = true;
          }
+      } else {
+         /* Z-only or S-only clear when both Z/S are present using a read-modify-write
+          * compute shader.
+          *
+          * If we get both clears but only one of them can be fast-cleared, we use
+          * the draw-based fast clear to do both at the same time.
+          */
+         const uint32_t htile_depth_writemask = 0xfffffc0f;
+         const uint32_t htile_stencil_writemask = 0x000003f0;
+
+         if (htile_size &&
+             !(*buffers & PIPE_CLEAR_STENCIL) &&
+             si_can_fast_clear_depth(zstex, level, depth, *buffers)) {
+            /* Z-only clear with stencil left intact. */
+            assert(num_clears < ARRAY_SIZE(info));
+            si_init_buffer_clear_rmw(&info[num_clears++], &zstex->buffer.b.b, htile_offset,
+                                     htile_size, si_get_htile_clear_value(zstex, depth),
+                                     htile_depth_writemask);
+            clear_types |= SI_CLEAR_TYPE_HTILE;
+            *buffers &= ~PIPE_CLEAR_DEPTH;
+            zstex->depth_cleared_level_mask |= BITFIELD_BIT(level);
+            update_db_depth_clear = true;
+         } else if (htile_size &&
+                    !(*buffers & PIPE_CLEAR_DEPTH) &&
+                    si_can_fast_clear_stencil(zstex, level, stencil, *buffers)) {
+            /* Stencil-only clear with depth left intact. */
+            assert(num_clears < ARRAY_SIZE(info));
+            si_init_buffer_clear_rmw(&info[num_clears++], &zstex->buffer.b.b, htile_offset,
+                                     htile_size, si_get_htile_clear_value(zstex, depth),
+                                     htile_stencil_writemask);
+            clear_types |= SI_CLEAR_TYPE_HTILE;
+            *buffers &= ~PIPE_CLEAR_STENCIL;
+            zstex->stencil_cleared_level_mask |= BITFIELD_BIT(level);
+            update_db_stencil_clear = true;
+         }
       }
 
       /* Update DB_DEPTH_CLEAR. */
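
A note on the two writemasks above: they are exact complements, so a
Z-only clear can never disturb stencil metadata and vice versa. The split
assumes the usual Z+S HTILE encoding (ZMask in bits 3:0, SR0/SR1/SMem in
bits 9:4, Z range in the upper bits); that layout is my reading, not
stated by the commit. A quick sanity check:

   #include <assert.h>
   #include <stdint.h>

   int main(void)
   {
      /* Values copied from si_fast_clear above. */
      const uint32_t htile_depth_writemask = 0xfffffc0f;
      const uint32_t htile_stencil_writemask = 0x000003f0;

      /* The masks must partition the 32-bit HTILE dword. */
      assert((htile_depth_writemask | htile_stencil_writemask) == 0xffffffffu);
      assert((htile_depth_writemask & htile_stencil_writemask) == 0);
      return 0;
   }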

--- a/src/gallium/drivers/radeonsi/si_compute_blit.c
+++ b/src/gallium/drivers/radeonsi/si_compute_blit.c

@@ -115,6 +115,80 @@ void si_launch_grid_internal(struct si_context *sctx, struct pipe_grid_info *inf
    }
 }
 
+/**
+ * Clear a buffer using read-modify-write with a 32-bit write bitmask.
+ * The clear value has 32 bits.
+ */
+void si_compute_clear_buffer_rmw(struct si_context *sctx, struct pipe_resource *dst,
+                                 unsigned dst_offset, unsigned size,
+                                 uint32_t clear_value, uint32_t writebitmask,
+                                 unsigned flags, enum si_coherency coher)
+{
+   struct pipe_context *ctx = &sctx->b;
+
+   assert(dst_offset % 4 == 0);
+   assert(size % 4 == 0);
+   assert(dst->target != PIPE_BUFFER || dst_offset + size <= dst->width0);
+
+   if (!(flags & SI_OP_SKIP_CACHE_INV_BEFORE))
+      sctx->flags |= si_get_flush_flags(sctx, coher, SI_COMPUTE_DST_CACHE_POLICY);
+
+   /* Save states. */
+   void *saved_cs = sctx->cs_shader_state.program;
+   struct pipe_shader_buffer saved_sb = {};
+   si_get_shader_buffers(sctx, PIPE_SHADER_COMPUTE, 0, 1, &saved_sb);
+
+   unsigned saved_writable_mask = 0;
+   if (sctx->const_and_shader_buffers[PIPE_SHADER_COMPUTE].writable_mask &
+       (1u << si_get_shaderbuf_slot(0)))
+      saved_writable_mask |= 1 << 0;
+
+   /* Use buffer_load_dwordx4 and buffer_store_dwordx4 per thread. */
+   unsigned dwords_per_instruction = 4;
+   unsigned wave_size = sctx->screen->compute_wave_size;
+   unsigned dwords_per_wave = dwords_per_instruction * wave_size;
+   unsigned num_dwords = size / 4;
+   unsigned num_instructions = DIV_ROUND_UP(num_dwords, dwords_per_instruction);
+
+   struct pipe_grid_info info = {};
+   info.block[0] = MIN2(wave_size, num_instructions);
+   info.block[1] = 1;
+   info.block[2] = 1;
+   info.grid[0] = DIV_ROUND_UP(num_dwords, dwords_per_wave);
+   info.grid[1] = 1;
+   info.grid[2] = 1;
+
+   struct pipe_shader_buffer sb = {};
+   sb.buffer = dst;
+   sb.buffer_offset = dst_offset;
+   sb.buffer_size = size;
+
+   ctx->set_shader_buffers(ctx, PIPE_SHADER_COMPUTE, 0, 1, &sb, 0x1);
+
+   sctx->cs_user_data[0] = clear_value & writebitmask;
+   sctx->cs_user_data[1] = ~writebitmask;
+
+   if (!sctx->cs_clear_buffer_rmw)
+      sctx->cs_clear_buffer_rmw = si_create_clear_buffer_rmw_cs(&sctx->b);
+   ctx->bind_compute_state(ctx, sctx->cs_clear_buffer_rmw);
+
+   si_launch_grid_internal(sctx, &info, saved_cs, flags);
+
+   enum si_cache_policy cache_policy = get_cache_policy(sctx, coher, size);
+
+   if (flags & SI_OP_SYNC_AFTER)
+      sctx->flags |= cache_policy == L2_BYPASS ? SI_CONTEXT_WB_L2 : 0;
+
+   if (cache_policy != L2_BYPASS)
+      si_resource(dst)->TC_L2_dirty = true;
+
+   /* Restore states. */
+   ctx->set_shader_buffers(ctx, PIPE_SHADER_COMPUTE, 0, 1, &saved_sb, saved_writable_mask);
+   pipe_resource_reference(&saved_sb.buffer, NULL);
+}
+
 static void si_compute_clear_12bytes_buffer(struct si_context *sctx, struct pipe_resource *dst,
                                             unsigned dst_offset, unsigned size,
                                             const uint32_t *clear_value, unsigned flags,
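
The dispatch sizing in si_compute_clear_buffer_rmw is easy to verify by
hand: each thread handles one vec4 (four dwords), so a wave covers
wave_size * 4 dwords. A worked example with made-up numbers (the 32 KiB
HTILE buffer and wave64 are assumptions for illustration):

   #include <stdio.h>

   #define DIV_ROUND_UP(n, d) (((n) + (d) - 1) / (d))
   #define MIN2(a, b) ((a) < (b) ? (a) : (b))

   int main(void)
   {
      unsigned size = 32768, wave_size = 64;  /* hypothetical inputs */
      unsigned dwords_per_instruction = 4;    /* one vec4 per thread */
      unsigned dwords_per_wave = dwords_per_instruction * wave_size; /* 256 */
      unsigned num_dwords = size / 4;                                /* 8192 */
      unsigned num_instructions = DIV_ROUND_UP(num_dwords, dwords_per_instruction);

      printf("block[0]=%u grid[0]=%u\n",
             MIN2(wave_size, num_instructions),          /* 64 threads */
             DIV_ROUND_UP(num_dwords, dwords_per_wave)); /* 32 groups  */
      return 0;
   }

Any excess lanes in the last wave presumably rely on the bounds checking
that buffer loads and stores perform against the shader buffer size.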

--- a/src/gallium/drivers/radeonsi/si_pipe.c
+++ b/src/gallium/drivers/radeonsi/si_pipe.c

@@ -239,6 +239,8 @@ static void si_destroy_context(struct pipe_context *context)
       sctx->b.delete_vs_state(&sctx->b, sctx->vs_blit_texcoord);
    if (sctx->cs_clear_buffer)
       sctx->b.delete_compute_state(&sctx->b, sctx->cs_clear_buffer);
+   if (sctx->cs_clear_buffer_rmw)
+      sctx->b.delete_compute_state(&sctx->b, sctx->cs_clear_buffer_rmw);
    if (sctx->cs_copy_buffer)
       sctx->b.delete_compute_state(&sctx->b, sctx->cs_copy_buffer);
    if (sctx->cs_copy_image)

--- a/src/gallium/drivers/radeonsi/si_pipe.h
+++ b/src/gallium/drivers/radeonsi/si_pipe.h

@@ -955,6 +955,7 @@ struct si_context {
    void *vs_blit_color_layered;
    void *vs_blit_texcoord;
    void *cs_clear_buffer;
+   void *cs_clear_buffer_rmw;
    void *cs_copy_buffer;
    void *cs_copy_image;
    void *cs_copy_image_1d_array;
@@ -1368,6 +1369,7 @@ struct si_clear_info {
    uint64_t offset;
    uint32_t size;
    uint32_t clear_value;
+   uint32_t writemask;
 };
 
 enum pipe_format si_simplify_cb_format(enum pipe_format format);
@@ -1406,6 +1408,10 @@ void si_clear_buffer(struct si_context *sctx, struct pipe_resource *dst,
                      uint64_t offset, uint64_t size, uint32_t *clear_value,
                      uint32_t clear_value_size, unsigned flags,
                      enum si_coherency coher, enum si_clear_method method);
+void si_compute_clear_buffer_rmw(struct si_context *sctx, struct pipe_resource *dst,
+                                 unsigned dst_offset, unsigned size,
+                                 uint32_t clear_value, uint32_t writebitmask,
+                                 unsigned flags, enum si_coherency coher);
 void si_screen_clear_buffer(struct si_screen *sscreen, struct pipe_resource *dst, uint64_t offset,
                             uint64_t size, unsigned value, unsigned flags);
 void si_copy_buffer(struct si_context *sctx, struct pipe_resource *dst, struct pipe_resource *src,
@@ -1539,6 +1545,7 @@ void *si_get_blitter_vs(struct si_context *sctx, enum blitter_attrib_type type,
 void *si_create_fixed_func_tcs(struct si_context *sctx);
 void *si_create_dma_compute_shader(struct pipe_context *ctx, unsigned num_dwords_per_thread,
                                    bool dst_stream_cache_policy, bool is_copy);
+void *si_create_clear_buffer_rmw_cs(struct pipe_context *ctx);
 void *si_create_copy_image_compute_shader(struct pipe_context *ctx);
 void *si_create_copy_image_compute_shader_1d_array(struct pipe_context *ctx);
 void *si_create_dcc_decompress_cs(struct pipe_context *ctx);

--- a/src/gallium/drivers/radeonsi/si_shaderlib_tgsi.c
+++ b/src/gallium/drivers/radeonsi/si_shaderlib_tgsi.c

@@ -212,6 +212,49 @@ void *si_create_dma_compute_shader(struct pipe_context *ctx, unsigned num_dwords
    return cs;
 }
 
+/* Create a compute shader that clears a buffer using read-modify-write:
+ * DATA = (DATA & ~writemask) | (clear_value & writemask), one vec4 per thread. */
+void *si_create_clear_buffer_rmw_cs(struct pipe_context *ctx)
+{
+   const char *text = "COMP\n"
+                      "PROPERTY CS_FIXED_BLOCK_WIDTH 64\n"
+                      "PROPERTY CS_FIXED_BLOCK_HEIGHT 1\n"
+                      "PROPERTY CS_FIXED_BLOCK_DEPTH 1\n"
+                      "PROPERTY CS_USER_DATA_COMPONENTS_AMD 2\n"
+                      "DCL SV[0], THREAD_ID\n"
+                      "DCL SV[1], BLOCK_ID\n"
+                      "DCL SV[2], CS_USER_DATA_AMD\n"
+                      "DCL BUFFER[0]\n"
+                      "DCL TEMP[0..1]\n"
+                      "IMM[0] UINT32 {64, 16, 0, 0}\n"
+                      /* ADDRESS = BLOCK_ID * 64 + THREAD_ID; */
+                      "UMAD TEMP[0].x, SV[1].xxxx, IMM[0].xxxx, SV[0].xxxx\n"
+                      /* ADDRESS = ADDRESS * 16; (byte offset, loading one vec4 per thread) */
+                      "UMUL TEMP[0].x, TEMP[0].xxxx, IMM[0].yyyy\n"
+                      "LOAD TEMP[1], BUFFER[0], TEMP[0].xxxx\n"
+                      /* DATA &= inverted_writemask; */
+                      "AND TEMP[1], TEMP[1], SV[2].yyyy\n"
+                      /* DATA |= clear_value_masked; */
+                      "OR TEMP[1], TEMP[1], SV[2].xxxx\n"
+                      "STORE BUFFER[0].xyzw, TEMP[0], TEMP[1]%s\n"
+                      "END\n";
+   char final_text[2048];
+   struct tgsi_token tokens[1024];
+   struct pipe_compute_state state = {0};
+
+   snprintf(final_text, sizeof(final_text), text,
+            SI_COMPUTE_DST_CACHE_POLICY != L2_LRU ? ", STREAM_CACHE_POLICY" : "");
+
+   if (!tgsi_text_translate(final_text, tokens, ARRAY_SIZE(tokens))) {
+      assert(false);
+      return NULL;
+   }
+
+   state.ir_type = PIPE_SHADER_IR_TGSI;
+   state.prog = tokens;
+
+   return ctx->create_compute_state(ctx, &state);
+}
+
 /* Create a compute shader that copies DCC from one buffer to another
  * where each DCC buffer has a different layout.
  *
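
Read as pseudocode, one thread of the TGSI shader above does roughly the
following (a C sketch for illustration; hardware bounds checking on the
buffer accesses is not modeled):

   #include <stdint.h>

   /* user_data[0] = clear_value & writemask, user_data[1] = ~writemask,
    * exactly as si_compute_clear_buffer_rmw packs them. */
   void clear_rmw_thread(uint32_t *buffer, unsigned block_id,
                         unsigned thread_id, const uint32_t user_data[2])
   {
      /* ADDRESS = (BLOCK_ID * 64 + THREAD_ID) * 16 bytes -> dword index */
      unsigned first_dword = (block_id * 64 + thread_id) * 4;

      for (unsigned i = 0; i < 4; i++) {       /* the vec4 LOAD/STORE */
         uint32_t data = buffer[first_dword + i];
         data &= user_data[1];                 /* AND: keep masked-off bits */
         data |= user_data[0];                 /* OR: write the clear bits  */
         buffer[first_dword + i] = data;
      }
   }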