radeonsi: use a compute shader to convert unsupported indices format

This commit replaces the CPU conversion of ubyte indices to ushort with a compute shader (the per-index operation is sketched after the list below).
The benefits are:
* we don't need to sync anymore
* we can allocate the index buffer in VRAM (no need to CPU map it)
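
The conversion itself is a plain zero-extension of each 8-bit index to 16 bits: what util_shorten_ubyte_elts_to_userptr used to do on the CPU for the whole buffer, one compute shader invocation now does for a single index. A rough C sketch of that per-index operation (illustrative only, not driver code; the helper name is made up):

    #include <stdint.h>

    /* Widen 8-bit indices to 16-bit ones. The old path ran this loop on the CPU;
     * the new path runs roughly one compute shader invocation per element. */
    static void widen_ubyte_indices(uint16_t *dst, const uint8_t *src, unsigned count)
    {
       for (unsigned i = 0; i < count; i++)
          dst[i] = src[i]; /* zero-extend */
    }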

Closes: https://gitlab.freedesktop.org/mesa/mesa/-/issues/10195
Reviewed-by: Marek Olšák <marek.olsak@amd.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/26416>
Authored by Pierre-Eric Pelloux-Prayer on 2023-11-30 10:15:40 +01:00, committed by Marge Bot
parent 5bdb42b1a2
commit 1a99f50c7f
5 changed files with 79 additions and 8 deletions


@@ -526,6 +526,41 @@ void si_copy_buffer(struct si_context *sctx, struct pipe_resource *dst, struct p
}
}
void si_compute_shorten_ubyte_buffer(struct si_context *sctx, struct pipe_resource *dst, struct pipe_resource *src,
                                     uint64_t dst_offset, uint64_t src_offset, unsigned size, unsigned flags)
{
   if (!size)
      return;

   if (!sctx->cs_ubyte_to_ushort)
      sctx->cs_ubyte_to_ushort = si_create_ubyte_to_ushort_compute_shader(sctx);

   enum si_coherency coher = SI_COHERENCY_SHADER;
   si_improve_sync_flags(sctx, dst, src, &flags);

   struct pipe_grid_info info = {};
   info.block[0] = si_determine_wave_size(sctx->screen, NULL);
   info.block[1] = 1;
   info.block[2] = 1;
   info.grid[0] = DIV_ROUND_UP(size, info.block[0]);
   info.grid[1] = 1;
   info.grid[2] = 1;
   info.last_block[0] = size % info.block[0];

   struct pipe_shader_buffer sb[2] = {};
   sb[0].buffer = dst;
   sb[0].buffer_offset = dst_offset;
   sb[0].buffer_size = dst->width0;

   sb[1].buffer = src;
   sb[1].buffer_offset = src_offset;
   sb[1].buffer_size = src->width0;

   si_launch_grid_internal_ssbos(sctx, &info, sctx->cs_ubyte_to_ushort, flags, coher,
                                 2, sb, 0x1);
}
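
The grid above is sized so that each invocation converts one index: block[0] is the wave size, grid[0] covers size elements, and last_block[0] trims the final partial workgroup. As a worked example (assuming a wave size of 64): converting 1000 indices gives grid[0] = DIV_ROUND_UP(1000, 64) = 16 workgroups and last_block[0] = 1000 % 64 = 40, so the last workgroup runs only 40 of its 64 invocations.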
static unsigned
set_work_size(struct pipe_grid_info *info, unsigned block_x, unsigned block_y, unsigned block_z,
              unsigned work_x, unsigned work_y, unsigned work_z)


@@ -276,6 +276,8 @@ static void si_destroy_context(struct pipe_context *context)
      sctx->b.delete_compute_state(&sctx->b, sctx->cs_clear_buffer_rmw);
   if (sctx->cs_copy_buffer)
      sctx->b.delete_compute_state(&sctx->b, sctx->cs_copy_buffer);
   if (sctx->cs_ubyte_to_ushort)
      sctx->b.delete_compute_state(&sctx->b, sctx->cs_ubyte_to_ushort);
   for (unsigned i = 0; i < ARRAY_SIZE(sctx->cs_copy_image); i++) {
      for (unsigned j = 0; j < ARRAY_SIZE(sctx->cs_copy_image[i]); j++) {
         for (unsigned k = 0; k < ARRAY_SIZE(sctx->cs_copy_image[i][j]); k++) {


@@ -1025,6 +1025,7 @@ struct si_context {
   void *cs_clear_buffer;
   void *cs_clear_buffer_rmw;
   void *cs_copy_buffer;
   void *cs_ubyte_to_ushort;
   void *cs_copy_image[3][2][2]; /* [wg_dim-1][src_is_1d][dst_is_1d] */
   void *cs_clear_render_target;
   void *cs_clear_render_target_1d_array;
@@ -1491,6 +1492,8 @@ void si_compute_clear_buffer_rmw(struct si_context *sctx, struct pipe_resource *
                                 unsigned flags, enum si_coherency coher);
void si_copy_buffer(struct si_context *sctx, struct pipe_resource *dst, struct pipe_resource *src,
                    uint64_t dst_offset, uint64_t src_offset, unsigned size, unsigned flags);
void si_compute_shorten_ubyte_buffer(struct si_context *sctx, struct pipe_resource *dst, struct pipe_resource *src,
                                     uint64_t dst_offset, uint64_t src_offset, unsigned size, unsigned flags);
bool si_compute_copy_image(struct si_context *sctx, struct pipe_resource *dst, unsigned dst_level,
                           struct pipe_resource *src, unsigned src_level, unsigned dstx,
                           unsigned dsty, unsigned dstz, const struct pipe_box *src_box,
@@ -1654,6 +1657,7 @@ void *si_get_blitter_vs(struct si_context *sctx, enum blitter_attrib_type type,
                        unsigned num_layers);
void *si_create_dma_compute_shader(struct si_context *sctx, unsigned num_dwords_per_thread,
                                   bool dst_stream_cache_policy, bool is_copy);
void *si_create_ubyte_to_ushort_compute_shader(struct si_context *sctx);
void *si_create_clear_buffer_rmw_cs(struct si_context *sctx);
void *si_clear_render_target_shader(struct si_context *sctx, enum pipe_texture_target type);
void *si_clear_12bytes_buffer_shader(struct si_context *sctx);


@@ -624,6 +624,37 @@ void *si_clear_12bytes_buffer_shader(struct si_context *sctx)
   return create_shader_state(sctx, b.shader);
}
void *si_create_ubyte_to_ushort_compute_shader(struct si_context *sctx)
{
   const nir_shader_compiler_options *options =
      sctx->b.screen->get_compiler_options(sctx->b.screen, PIPE_SHADER_IR_NIR, PIPE_SHADER_COMPUTE);

   unsigned store_qualifier = ACCESS_COHERENT | ACCESS_RESTRICT;
   /* Don't cache loads, because there is no reuse. */
   unsigned load_qualifier = store_qualifier | ACCESS_NON_TEMPORAL;

   nir_builder b =
      nir_builder_init_simple_shader(MESA_SHADER_COMPUTE, options, "ubyte_to_ushort");

   unsigned default_wave_size = si_determine_wave_size(sctx->screen, NULL);
   b.shader->info.workgroup_size[0] = default_wave_size;
   b.shader->info.workgroup_size[1] = 1;
   b.shader->info.workgroup_size[2] = 1;
   b.shader->info.num_ssbos = 2;

   nir_def *load_address = get_global_ids(&b, 1);
   nir_def *store_address = nir_imul_imm(&b, load_address, 2);

   nir_def *ubyte_value = nir_load_ssbo(&b, 1, 8, nir_imm_int(&b, 1),
                                        load_address, .access = load_qualifier);
   nir_store_ssbo(&b, nir_u2uN(&b, ubyte_value, 16), nir_imm_int(&b, 0),
                  store_address, .access = store_qualifier);

   return create_shader_state(sctx, b.shader);
}
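
Each invocation of the shader built above loads one byte from SSBO 1 (the source ubyte index buffer) at byte offset load_address = global invocation id, zero-extends it to 16 bits with nir_u2uN, and stores the result to SSBO 0 (the destination ushort buffer) at byte offset store_address = 2 * id; for example, invocation 10 reads source byte 10 and writes destination bytes 20 and 21. This matches the sb[0] = dst / sb[1] = src binding order set up by si_compute_shorten_ubyte_buffer() earlier in this commit.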
/* Create a compute shader implementing clear_buffer or copy_buffer. */
void *si_create_dma_compute_shader(struct si_context *sctx, unsigned num_dwords_per_thread,
bool dst_stream_cache_policy, bool is_copy)


@@ -2044,23 +2044,22 @@ static void si_draw(struct pipe_context *ctx,
   /* Translate or upload, if needed. */
   /* 8-bit indices are supported on GFX8. */
   if (!IS_DRAW_VERTEX_STATE && GFX_VERSION <= GFX7 && index_size == 1) {
-      unsigned start, count, start_offset, size, offset;
-      void *ptr;
+      unsigned start, count, start_offset, size;

      si_get_draw_start_count(sctx, info, indirect, draws, num_draws, &start, &count);
      start_offset = start * 2;
      size = count * 2;

-      indexbuf = NULL;
-      u_upload_alloc(ctx->stream_uploader, start_offset, size,
-                     si_optimal_tcc_alignment(sctx, size), &offset, &indexbuf, &ptr);
+      /* Don't use u_upload_alloc because we don't need to map the buffer for CPU access. */
+      indexbuf = pipe_buffer_create(&sctx->screen->b, 0, PIPE_USAGE_IMMUTABLE, start_offset + size);
      if (unlikely(!indexbuf))
         return;

-      util_shorten_ubyte_elts_to_userptr(&sctx->b, info, 0, 0, index_offset + start, count, ptr);
+      si_compute_shorten_ubyte_buffer(sctx, indexbuf, info->index.resource,
+                                      start_offset, index_offset + start, count,
+                                      SI_OP_SYNC_AFTER);

      /* info->start will be added by the drawing code */
-      index_offset = offset - start_offset;
+      index_offset = 0;
      index_size = 2;
   } else if (!IS_DRAW_VERTEX_STATE && info->has_user_indices) {
      unsigned start_offset;