mirror of
https://gitlab.freedesktop.org/mesa/mesa.git
synced 2025-12-24 04:30:10 +01:00
radeonsi: do Z-only or S-only HTILE clear using a compute shader doing RMW
This adds a clear_buffer compute shader that does read-modify-write to update a subset of bits in HTILE. Reviewed-by: Pierre-Eric Pelloux-Prayer <pierre-eric.pelloux-prayer@amd.com> Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/10003>
This commit is contained in:
parent
84fa21a611
commit
06b6af596c
5 changed files with 182 additions and 4 deletions
|
|
@ -42,6 +42,15 @@ void si_init_buffer_clear(struct si_clear_info *info,
|
|||
info->offset = offset;
|
||||
info->size = size;
|
||||
info->clear_value = clear_value;
|
||||
info->writemask = 0xffffffff;
|
||||
}
|
||||
|
||||
static void si_init_buffer_clear_rmw(struct si_clear_info *info,
|
||||
struct pipe_resource *resource, uint64_t offset,
|
||||
uint32_t size, uint32_t clear_value, uint32_t writemask)
|
||||
{
|
||||
si_init_buffer_clear(info, resource, offset, size, clear_value);
|
||||
info->writemask = writemask;
|
||||
}
|
||||
|
||||
void si_execute_clears(struct si_context *sctx, struct si_clear_info *info,
|
||||
|
|
@ -66,10 +75,18 @@ void si_execute_clears(struct si_context *sctx, struct si_clear_info *info,
|
|||
|
||||
/* Execute clears. */
|
||||
for (unsigned i = 0; i < num_clears; i++) {
|
||||
/* Compute shaders are much faster on both dGPUs and APUs. Don't use CP DMA. */
|
||||
si_clear_buffer(sctx, info[i].resource, info[i].offset, info[i].size,
|
||||
&info[i].clear_value, 4, SI_OP_SKIP_CACHE_INV_BEFORE,
|
||||
SI_COHERENCY_CP, SI_COMPUTE_CLEAR_METHOD);
|
||||
assert(info[i].size > 0);
|
||||
|
||||
if (info[i].writemask != 0xffffffff) {
|
||||
si_compute_clear_buffer_rmw(sctx, info[i].resource, info[i].offset, info[i].size,
|
||||
info[i].clear_value, info[i].writemask,
|
||||
SI_OP_SKIP_CACHE_INV_BEFORE, SI_COHERENCY_CP);
|
||||
} else {
|
||||
/* Compute shaders are much faster on both dGPUs and APUs. Don't use CP DMA. */
|
||||
si_clear_buffer(sctx, info[i].resource, info[i].offset, info[i].size,
|
||||
&info[i].clear_value, 4, SI_OP_SKIP_CACHE_INV_BEFORE,
|
||||
SI_COHERENCY_CP, SI_COMPUTE_CLEAR_METHOD);
|
||||
}
|
||||
}
|
||||
|
||||
/* Wait for idle. */
|
||||
|
|
@ -885,6 +902,41 @@ static void si_fast_clear(struct si_context *sctx, unsigned *buffers,
|
|||
update_db_depth_clear = true;
|
||||
update_db_stencil_clear = true;
|
||||
}
|
||||
} else {
|
||||
/* Z-only or S-only clear when both Z/S are present using a read-modify-write
|
||||
* compute shader.
|
||||
*
|
||||
* If we get both clears but only one of them can be fast-cleared, we use
|
||||
* the draw-based fast clear to do both at the same time.
|
||||
*/
|
||||
const uint32_t htile_depth_writemask = 0xfffffc0f;
|
||||
const uint32_t htile_stencil_writemask = 0x000003f0;
|
||||
|
||||
if (htile_size &&
|
||||
!(*buffers & PIPE_CLEAR_STENCIL) &&
|
||||
si_can_fast_clear_depth(zstex, level, depth, *buffers)) {
|
||||
/* Z-only clear with stencil left intact. */
|
||||
assert(num_clears < ARRAY_SIZE(info));
|
||||
si_init_buffer_clear_rmw(&info[num_clears++], &zstex->buffer.b.b, htile_offset,
|
||||
htile_size, si_get_htile_clear_value(zstex, depth),
|
||||
htile_depth_writemask);
|
||||
clear_types |= SI_CLEAR_TYPE_HTILE;
|
||||
*buffers &= ~PIPE_CLEAR_DEPTH;
|
||||
zstex->depth_cleared_level_mask |= BITFIELD_BIT(level);
|
||||
update_db_depth_clear = true;
|
||||
} else if (htile_size &&
|
||||
!(*buffers & PIPE_CLEAR_DEPTH) &&
|
||||
si_can_fast_clear_stencil(zstex, level, stencil, *buffers)) {
|
||||
/* Stencil-only clear with depth left intact. */
|
||||
assert(num_clears < ARRAY_SIZE(info));
|
||||
si_init_buffer_clear_rmw(&info[num_clears++], &zstex->buffer.b.b, htile_offset,
|
||||
htile_size, si_get_htile_clear_value(zstex, depth),
|
||||
htile_stencil_writemask);
|
||||
clear_types |= SI_CLEAR_TYPE_HTILE;
|
||||
*buffers &= ~PIPE_CLEAR_STENCIL;
|
||||
zstex->stencil_cleared_level_mask |= BITFIELD_BIT(level);
|
||||
update_db_stencil_clear = true;
|
||||
}
|
||||
}
|
||||
|
||||
/* Update DB_DEPTH_CLEAR. */
|
||||
|
|
|
|||
|
|
@ -115,6 +115,80 @@ void si_launch_grid_internal(struct si_context *sctx, struct pipe_grid_info *inf
|
|||
}
|
||||
}
|
||||
|
||||
/**
 * Clear a buffer using read-modify-write with a 32-bit write bitmask.
 * The clear value has 32 bits.
 *
 * Each dword in [dst_offset, dst_offset + size) becomes:
 *    (old & ~writebitmask) | (clear_value & writebitmask)
 *
 * \param dst_offset    byte offset into dst; must be dword-aligned
 * \param size          byte size of the cleared range; must be a multiple of 4
 * \param flags         SI_OP_* flags (cache invalidation before, sync after)
 * \param coher         coherency domain used to pick flush flags / cache policy
 */
void si_compute_clear_buffer_rmw(struct si_context *sctx, struct pipe_resource *dst,
                                 unsigned dst_offset, unsigned size,
                                 uint32_t clear_value, uint32_t writebitmask,
                                 unsigned flags, enum si_coherency coher)
{
   struct pipe_context *ctx = &sctx->b;

   /* The shader operates on whole dwords. */
   assert(dst_offset % 4 == 0);
   assert(size % 4 == 0);

   assert(dst->target != PIPE_BUFFER || dst_offset + size <= dst->width0);

   /* Flush caches before the clear unless the caller already did. */
   if (!(flags & SI_OP_SKIP_CACHE_INV_BEFORE))
      sctx->flags |= si_get_flush_flags(sctx, coher, SI_COMPUTE_DST_CACHE_POLICY);

   /* Save states. */
   void *saved_cs = sctx->cs_shader_state.program;
   struct pipe_shader_buffer saved_sb = {};
   si_get_shader_buffers(sctx, PIPE_SHADER_COMPUTE, 0, 1, &saved_sb);

   /* Remember whether shader buffer slot 0 was bound writable so the restore
    * below reinstates the same writability. */
   unsigned saved_writable_mask = 0;
   if (sctx->const_and_shader_buffers[PIPE_SHADER_COMPUTE].writable_mask &
       (1u << si_get_shaderbuf_slot(0)))
      saved_writable_mask |= 1 << 0;

   /* Use buffer_load_dwordx4 and buffer_store_dwordx4 per thread. */
   unsigned dwords_per_instruction = 4;
   unsigned wave_size = sctx->screen->compute_wave_size;
   unsigned dwords_per_wave = dwords_per_instruction * wave_size;

   unsigned num_dwords = size / 4;
   unsigned num_instructions = DIV_ROUND_UP(num_dwords, dwords_per_instruction);

   /* One workgroup per wave; shrink the block for very small clears. */
   struct pipe_grid_info info = {};
   info.block[0] = MIN2(wave_size, num_instructions);
   info.block[1] = 1;
   info.block[2] = 1;
   info.grid[0] = DIV_ROUND_UP(num_dwords, dwords_per_wave);
   info.grid[1] = 1;
   info.grid[2] = 1;

   /* Bind the destination range as shader buffer slot 0 (writable). */
   struct pipe_shader_buffer sb = {};
   sb.buffer = dst;
   sb.buffer_offset = dst_offset;
   sb.buffer_size = size;
   ctx->set_shader_buffers(ctx, PIPE_SHADER_COMPUTE, 0, 1, &sb, 0x1);

   /* The shader computes: data = (data & user_data[1]) | user_data[0]. */
   sctx->cs_user_data[0] = clear_value & writebitmask;
   sctx->cs_user_data[1] = ~writebitmask;

   /* Create the RMW clear shader lazily on first use. */
   if (!sctx->cs_clear_buffer_rmw)
      sctx->cs_clear_buffer_rmw = si_create_clear_buffer_rmw_cs(&sctx->b);

   ctx->bind_compute_state(ctx, sctx->cs_clear_buffer_rmw);

   si_launch_grid_internal(sctx, &info, saved_cs, flags);

   enum si_cache_policy cache_policy = get_cache_policy(sctx, coher, size);

   /* If the data bypassed L2, write it back so other clients see it. */
   if (flags & SI_OP_SYNC_AFTER)
      sctx->flags |= cache_policy == L2_BYPASS ? SI_CONTEXT_WB_L2 : 0;

   if (cache_policy != L2_BYPASS)
      si_resource(dst)->TC_L2_dirty = true;

   /* Restore states. */
   ctx->set_shader_buffers(ctx, PIPE_SHADER_COMPUTE, 0, 1, &saved_sb, saved_writable_mask);
   pipe_resource_reference(&saved_sb.buffer, NULL);
}
|
||||
|
||||
static void si_compute_clear_12bytes_buffer(struct si_context *sctx, struct pipe_resource *dst,
|
||||
unsigned dst_offset, unsigned size,
|
||||
const uint32_t *clear_value, unsigned flags,
|
||||
|
|
|
|||
|
|
@ -239,6 +239,8 @@ static void si_destroy_context(struct pipe_context *context)
|
|||
sctx->b.delete_vs_state(&sctx->b, sctx->vs_blit_texcoord);
|
||||
if (sctx->cs_clear_buffer)
|
||||
sctx->b.delete_compute_state(&sctx->b, sctx->cs_clear_buffer);
|
||||
if (sctx->cs_clear_buffer_rmw)
|
||||
sctx->b.delete_compute_state(&sctx->b, sctx->cs_clear_buffer_rmw);
|
||||
if (sctx->cs_copy_buffer)
|
||||
sctx->b.delete_compute_state(&sctx->b, sctx->cs_copy_buffer);
|
||||
if (sctx->cs_copy_image)
|
||||
|
|
|
|||
|
|
@ -955,6 +955,7 @@ struct si_context {
|
|||
void *vs_blit_color_layered;
|
||||
void *vs_blit_texcoord;
|
||||
void *cs_clear_buffer;
|
||||
void *cs_clear_buffer_rmw;
|
||||
void *cs_copy_buffer;
|
||||
void *cs_copy_image;
|
||||
void *cs_copy_image_1d_array;
|
||||
|
|
@ -1368,6 +1369,7 @@ struct si_clear_info {
|
|||
uint64_t offset;
|
||||
uint32_t size;
|
||||
uint32_t clear_value;
|
||||
uint32_t writemask;
|
||||
};
|
||||
|
||||
enum pipe_format si_simplify_cb_format(enum pipe_format format);
|
||||
|
|
@ -1406,6 +1408,10 @@ void si_clear_buffer(struct si_context *sctx, struct pipe_resource *dst,
|
|||
uint64_t offset, uint64_t size, uint32_t *clear_value,
|
||||
uint32_t clear_value_size, unsigned flags,
|
||||
enum si_coherency coher, enum si_clear_method method);
|
||||
void si_compute_clear_buffer_rmw(struct si_context *sctx, struct pipe_resource *dst,
|
||||
unsigned dst_offset, unsigned size,
|
||||
uint32_t clear_value, uint32_t writebitmask,
|
||||
unsigned flags, enum si_coherency coher);
|
||||
void si_screen_clear_buffer(struct si_screen *sscreen, struct pipe_resource *dst, uint64_t offset,
|
||||
uint64_t size, unsigned value, unsigned flags);
|
||||
void si_copy_buffer(struct si_context *sctx, struct pipe_resource *dst, struct pipe_resource *src,
|
||||
|
|
@ -1539,6 +1545,7 @@ void *si_get_blitter_vs(struct si_context *sctx, enum blitter_attrib_type type,
|
|||
void *si_create_fixed_func_tcs(struct si_context *sctx);
|
||||
void *si_create_dma_compute_shader(struct pipe_context *ctx, unsigned num_dwords_per_thread,
|
||||
bool dst_stream_cache_policy, bool is_copy);
|
||||
void *si_create_clear_buffer_rmw_cs(struct pipe_context *ctx);
|
||||
void *si_create_copy_image_compute_shader(struct pipe_context *ctx);
|
||||
void *si_create_copy_image_compute_shader_1d_array(struct pipe_context *ctx);
|
||||
void *si_create_dcc_decompress_cs(struct pipe_context *ctx);
|
||||
|
|
|
|||
|
|
@ -212,6 +212,49 @@ void *si_create_dma_compute_shader(struct pipe_context *ctx, unsigned num_dwords
|
|||
return cs;
|
||||
}
|
||||
|
||||
/* Create a compute shader implementing clear_buffer or copy_buffer. */
|
||||
void *si_create_clear_buffer_rmw_cs(struct pipe_context *ctx)
|
||||
{
|
||||
const char *text = "COMP\n"
|
||||
"PROPERTY CS_FIXED_BLOCK_WIDTH 64\n"
|
||||
"PROPERTY CS_FIXED_BLOCK_HEIGHT 1\n"
|
||||
"PROPERTY CS_FIXED_BLOCK_DEPTH 1\n"
|
||||
"PROPERTY CS_USER_DATA_COMPONENTS_AMD 2\n"
|
||||
"DCL SV[0], THREAD_ID\n"
|
||||
"DCL SV[1], BLOCK_ID\n"
|
||||
"DCL SV[2], CS_USER_DATA_AMD\n"
|
||||
"DCL BUFFER[0]\n"
|
||||
"DCL TEMP[0..1]\n"
|
||||
"IMM[0] UINT32 {64, 16, 0, 0}\n"
|
||||
/* ADDRESS = BLOCK_ID * 64 + THREAD_ID; */
|
||||
"UMAD TEMP[0].x, SV[1].xxxx, IMM[0].xxxx, SV[0].xxxx\n"
|
||||
/* ADDRESS = ADDRESS * 16; (byte offset, loading one vec4 per thread) */
|
||||
"UMUL TEMP[0].x, TEMP[0].xxxx, IMM[0].yyyy\n"
|
||||
"LOAD TEMP[1], BUFFER[0], TEMP[0].xxxx\n"
|
||||
/* DATA &= inverted_writemask; */
|
||||
"AND TEMP[1], TEMP[1], SV[2].yyyy\n"
|
||||
/* DATA |= clear_value_masked; */
|
||||
"OR TEMP[1], TEMP[1], SV[2].xxxx\n"
|
||||
"STORE BUFFER[0].xyzw, TEMP[0], TEMP[1]%s\n"
|
||||
"END\n";
|
||||
char final_text[2048];
|
||||
struct tgsi_token tokens[1024];
|
||||
struct pipe_compute_state state = {0};
|
||||
|
||||
snprintf(final_text, sizeof(final_text), text,
|
||||
SI_COMPUTE_DST_CACHE_POLICY != L2_LRU ? ", STREAM_CACHE_POLICY" : "");
|
||||
|
||||
if (!tgsi_text_translate(final_text, tokens, ARRAY_SIZE(tokens))) {
|
||||
assert(false);
|
||||
return NULL;
|
||||
}
|
||||
|
||||
state.ir_type = PIPE_SHADER_IR_TGSI;
|
||||
state.prog = tokens;
|
||||
|
||||
return ctx->create_compute_state(ctx, &state);
|
||||
}
|
||||
|
||||
/* Create a compute shader that copies DCC from one buffer to another
|
||||
* where each DCC buffer has a different layout.
|
||||
*
|
||||
|
|
|
|||
Loading…
Add table
Reference in a new issue