mirror of
https://gitlab.freedesktop.org/mesa/mesa.git
synced 2025-12-23 15:30:14 +01:00
radeonsi: use a compute shader to convert unsupported indices format
This commit replaces the CPU conversion of ubyte to ushort with a compute shader. The benefits are: * we don't need to sync anymore * we can allocate the index buffer in VRAM (no need to CPU map it) Closes: https://gitlab.freedesktop.org/mesa/mesa/-/issues/10195 Reviewed-by: Marek Olšák <marek.olsak@amd.com> Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/26416>
This commit is contained in:
parent
5bdb42b1a2
commit
1a99f50c7f
5 changed files with 79 additions and 8 deletions
|
|
@ -526,6 +526,41 @@ void si_copy_buffer(struct si_context *sctx, struct pipe_resource *dst, struct p
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/* Convert a buffer of 8-bit values to 16-bit values with a compute shader
 * (used for 8-bit index buffers on GPUs without native support). Running the
 * conversion on the GPU avoids CPU-mapping the destination and needs no sync.
 *
 * dst_offset/src_offset are byte offsets into dst/src; size is the number of
 * source bytes (one element each); flags are SI_OP_* sync flags.
 *
 * NOTE(review): reconstructed from a garbled diff view — verify against the
 * upstream commit before applying.
 */
void si_compute_shorten_ubyte_buffer(struct si_context *sctx, struct pipe_resource *dst, struct pipe_resource *src,
                                     uint64_t dst_offset, uint64_t src_offset, unsigned size, unsigned flags)
{
   if (!size)
      return;

   /* Create the conversion shader lazily on first use. */
   if (!sctx->cs_ubyte_to_ushort)
      sctx->cs_ubyte_to_ushort = si_create_ubyte_to_ushort_compute_shader(sctx);

   enum si_coherency coher = SI_COHERENCY_SHADER;

   si_improve_sync_flags(sctx, dst, src, &flags);

   /* One thread per element: a 1D grid of wave-sized workgroups, with
    * last_block trimming the final partial workgroup to "size" threads.
    */
   struct pipe_grid_info info = {};
   info.block[0] = si_determine_wave_size(sctx->screen, NULL);
   info.block[1] = 1;
   info.block[2] = 1;
   info.grid[0] = DIV_ROUND_UP(size, info.block[0]);
   info.grid[1] = 1;
   info.grid[2] = 1;
   info.last_block[0] = size % info.block[0];

   /* SSBO 0 = destination (ushorts), SSBO 1 = source (ubytes). */
   struct pipe_shader_buffer sb[2] = {};
   sb[0].buffer = dst;
   sb[0].buffer_offset = dst_offset;
   sb[0].buffer_size = dst->width0;

   sb[1].buffer = src;
   sb[1].buffer_offset = src_offset;
   sb[1].buffer_size = src->width0;

   /* writeable_bitmask = 0x1: only sb[0] (dst) is written by the shader. */
   si_launch_grid_internal_ssbos(sctx, &info, sctx->cs_ubyte_to_ushort, flags, coher,
                                 2, sb, 0x1);
}
|
||||||
|
|
||||||
static unsigned
|
static unsigned
|
||||||
set_work_size(struct pipe_grid_info *info, unsigned block_x, unsigned block_y, unsigned block_z,
|
set_work_size(struct pipe_grid_info *info, unsigned block_x, unsigned block_y, unsigned block_z,
|
||||||
unsigned work_x, unsigned work_y, unsigned work_z)
|
unsigned work_x, unsigned work_y, unsigned work_z)
|
||||||
|
|
|
||||||
|
|
@ -276,6 +276,8 @@ static void si_destroy_context(struct pipe_context *context)
|
||||||
sctx->b.delete_compute_state(&sctx->b, sctx->cs_clear_buffer_rmw);
|
sctx->b.delete_compute_state(&sctx->b, sctx->cs_clear_buffer_rmw);
|
||||||
if (sctx->cs_copy_buffer)
|
if (sctx->cs_copy_buffer)
|
||||||
sctx->b.delete_compute_state(&sctx->b, sctx->cs_copy_buffer);
|
sctx->b.delete_compute_state(&sctx->b, sctx->cs_copy_buffer);
|
||||||
|
if (sctx->cs_ubyte_to_ushort)
|
||||||
|
sctx->b.delete_compute_state(&sctx->b, sctx->cs_ubyte_to_ushort);
|
||||||
for (unsigned i = 0; i < ARRAY_SIZE(sctx->cs_copy_image); i++) {
|
for (unsigned i = 0; i < ARRAY_SIZE(sctx->cs_copy_image); i++) {
|
||||||
for (unsigned j = 0; j < ARRAY_SIZE(sctx->cs_copy_image[i]); j++) {
|
for (unsigned j = 0; j < ARRAY_SIZE(sctx->cs_copy_image[i]); j++) {
|
||||||
for (unsigned k = 0; k < ARRAY_SIZE(sctx->cs_copy_image[i][j]); k++) {
|
for (unsigned k = 0; k < ARRAY_SIZE(sctx->cs_copy_image[i][j]); k++) {
|
||||||
|
|
|
||||||
|
|
@ -1025,6 +1025,7 @@ struct si_context {
|
||||||
void *cs_clear_buffer;
|
void *cs_clear_buffer;
|
||||||
void *cs_clear_buffer_rmw;
|
void *cs_clear_buffer_rmw;
|
||||||
void *cs_copy_buffer;
|
void *cs_copy_buffer;
|
||||||
|
void *cs_ubyte_to_ushort;
|
||||||
void *cs_copy_image[3][2][2]; /* [wg_dim-1][src_is_1d][dst_is_1d] */
|
void *cs_copy_image[3][2][2]; /* [wg_dim-1][src_is_1d][dst_is_1d] */
|
||||||
void *cs_clear_render_target;
|
void *cs_clear_render_target;
|
||||||
void *cs_clear_render_target_1d_array;
|
void *cs_clear_render_target_1d_array;
|
||||||
|
|
@ -1491,6 +1492,8 @@ void si_compute_clear_buffer_rmw(struct si_context *sctx, struct pipe_resource *
|
||||||
unsigned flags, enum si_coherency coher);
|
unsigned flags, enum si_coherency coher);
|
||||||
void si_copy_buffer(struct si_context *sctx, struct pipe_resource *dst, struct pipe_resource *src,
|
void si_copy_buffer(struct si_context *sctx, struct pipe_resource *dst, struct pipe_resource *src,
|
||||||
uint64_t dst_offset, uint64_t src_offset, unsigned size, unsigned flags);
|
uint64_t dst_offset, uint64_t src_offset, unsigned size, unsigned flags);
|
||||||
|
void si_compute_shorten_ubyte_buffer(struct si_context *sctx, struct pipe_resource *dst, struct pipe_resource *src,
|
||||||
|
uint64_t dst_offset, uint64_t src_offset, unsigned size, unsigned flags);
|
||||||
bool si_compute_copy_image(struct si_context *sctx, struct pipe_resource *dst, unsigned dst_level,
|
bool si_compute_copy_image(struct si_context *sctx, struct pipe_resource *dst, unsigned dst_level,
|
||||||
struct pipe_resource *src, unsigned src_level, unsigned dstx,
|
struct pipe_resource *src, unsigned src_level, unsigned dstx,
|
||||||
unsigned dsty, unsigned dstz, const struct pipe_box *src_box,
|
unsigned dsty, unsigned dstz, const struct pipe_box *src_box,
|
||||||
|
|
@ -1654,6 +1657,7 @@ void *si_get_blitter_vs(struct si_context *sctx, enum blitter_attrib_type type,
|
||||||
unsigned num_layers);
|
unsigned num_layers);
|
||||||
void *si_create_dma_compute_shader(struct si_context *sctx, unsigned num_dwords_per_thread,
|
void *si_create_dma_compute_shader(struct si_context *sctx, unsigned num_dwords_per_thread,
|
||||||
bool dst_stream_cache_policy, bool is_copy);
|
bool dst_stream_cache_policy, bool is_copy);
|
||||||
|
void *si_create_ubyte_to_ushort_compute_shader(struct si_context *sctx);
|
||||||
void *si_create_clear_buffer_rmw_cs(struct si_context *sctx);
|
void *si_create_clear_buffer_rmw_cs(struct si_context *sctx);
|
||||||
void *si_clear_render_target_shader(struct si_context *sctx, enum pipe_texture_target type);
|
void *si_clear_render_target_shader(struct si_context *sctx, enum pipe_texture_target type);
|
||||||
void *si_clear_12bytes_buffer_shader(struct si_context *sctx);
|
void *si_clear_12bytes_buffer_shader(struct si_context *sctx);
|
||||||
|
|
|
||||||
|
|
@ -624,6 +624,37 @@ void *si_clear_12bytes_buffer_shader(struct si_context *sctx)
|
||||||
return create_shader_state(sctx, b.shader);
|
return create_shader_state(sctx, b.shader);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
void *si_create_ubyte_to_ushort_compute_shader(struct si_context *sctx)
|
||||||
|
{
|
||||||
|
const nir_shader_compiler_options *options =
|
||||||
|
sctx->b.screen->get_compiler_options(sctx->b.screen, PIPE_SHADER_IR_NIR, PIPE_SHADER_COMPUTE);
|
||||||
|
|
||||||
|
unsigned store_qualifier = ACCESS_COHERENT | ACCESS_RESTRICT;
|
||||||
|
|
||||||
|
/* Don't cache loads, because there is no reuse. */
|
||||||
|
unsigned load_qualifier = store_qualifier | ACCESS_NON_TEMPORAL;
|
||||||
|
|
||||||
|
nir_builder b =
|
||||||
|
nir_builder_init_simple_shader(MESA_SHADER_COMPUTE, options, "ubyte_to_ushort");
|
||||||
|
|
||||||
|
unsigned default_wave_size = si_determine_wave_size(sctx->screen, NULL);
|
||||||
|
|
||||||
|
b.shader->info.workgroup_size[0] = default_wave_size;
|
||||||
|
b.shader->info.workgroup_size[1] = 1;
|
||||||
|
b.shader->info.workgroup_size[2] = 1;
|
||||||
|
b.shader->info.num_ssbos = 2;
|
||||||
|
|
||||||
|
nir_def *load_address = get_global_ids(&b, 1);
|
||||||
|
nir_def *store_address = nir_imul_imm(&b, load_address, 2);
|
||||||
|
|
||||||
|
nir_def *ubyte_value = nir_load_ssbo(&b, 1, 8, nir_imm_int(&b, 1),
|
||||||
|
load_address, .access = load_qualifier);
|
||||||
|
nir_store_ssbo(&b, nir_u2uN(&b, ubyte_value, 16), nir_imm_int(&b, 0),
|
||||||
|
store_address, .access = store_qualifier);
|
||||||
|
|
||||||
|
return create_shader_state(sctx, b.shader);
|
||||||
|
}
|
||||||
|
|
||||||
/* Create a compute shader implementing clear_buffer or copy_buffer. */
|
/* Create a compute shader implementing clear_buffer or copy_buffer. */
|
||||||
void *si_create_dma_compute_shader(struct si_context *sctx, unsigned num_dwords_per_thread,
|
void *si_create_dma_compute_shader(struct si_context *sctx, unsigned num_dwords_per_thread,
|
||||||
bool dst_stream_cache_policy, bool is_copy)
|
bool dst_stream_cache_policy, bool is_copy)
|
||||||
|
|
|
||||||
|
|
@ -2044,23 +2044,22 @@ static void si_draw(struct pipe_context *ctx,
|
||||||
/* Translate or upload, if needed. */
|
/* Translate or upload, if needed. */
|
||||||
/* 8-bit indices are supported on GFX8. */
|
/* 8-bit indices are supported on GFX8. */
|
||||||
if (!IS_DRAW_VERTEX_STATE && GFX_VERSION <= GFX7 && index_size == 1) {
|
if (!IS_DRAW_VERTEX_STATE && GFX_VERSION <= GFX7 && index_size == 1) {
|
||||||
unsigned start, count, start_offset, size, offset;
|
unsigned start, count, start_offset, size;
|
||||||
void *ptr;
|
|
||||||
|
|
||||||
si_get_draw_start_count(sctx, info, indirect, draws, num_draws, &start, &count);
|
si_get_draw_start_count(sctx, info, indirect, draws, num_draws, &start, &count);
|
||||||
start_offset = start * 2;
|
start_offset = start * 2;
|
||||||
size = count * 2;
|
size = count * 2;
|
||||||
|
|
||||||
indexbuf = NULL;
|
/* Don't use u_upload_alloc because we don't need to map the buffer for CPU access. */
|
||||||
u_upload_alloc(ctx->stream_uploader, start_offset, size,
|
indexbuf = pipe_buffer_create(&sctx->screen->b, 0, PIPE_USAGE_IMMUTABLE, start_offset + size);
|
||||||
si_optimal_tcc_alignment(sctx, size), &offset, &indexbuf, &ptr);
|
|
||||||
if (unlikely(!indexbuf))
|
if (unlikely(!indexbuf))
|
||||||
return;
|
return;
|
||||||
|
|
||||||
util_shorten_ubyte_elts_to_userptr(&sctx->b, info, 0, 0, index_offset + start, count, ptr);
|
si_compute_shorten_ubyte_buffer(sctx, indexbuf, info->index.resource,
|
||||||
|
start_offset, index_offset + start, count,
|
||||||
|
SI_OP_SYNC_AFTER);
|
||||||
|
|
||||||
/* info->start will be added by the drawing code */
|
index_offset = 0;
|
||||||
index_offset = offset - start_offset;
|
|
||||||
index_size = 2;
|
index_size = 2;
|
||||||
} else if (!IS_DRAW_VERTEX_STATE && info->has_user_indices) {
|
} else if (!IS_DRAW_VERTEX_STATE && info->has_user_indices) {
|
||||||
unsigned start_offset;
|
unsigned start_offset;
|
||||||
|
|
|
||||||
Loading…
Add table
Reference in a new issue