radeonsi: use a compute shader to convert unsupported indices format

This commit replaces the CPU conversion of ubyte indices to ushort with a compute shader (the per-index operation is sketched after the list below).
The benefits are:
* we don't need to sync anymore
* we can allocate the index buffer in VRAM (no need to CPU map it)
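
The conversion itself is a plain zero-extension of each 8-bit index to 16 bits: what util_shorten_ubyte_elts_to_userptr used to do on the CPU for the whole buffer, one compute shader invocation now does for a single index. A rough C sketch of that per-index operation (illustrative only, not driver code; the helper name is made up):

    #include <stdint.h>

    /* Widen 8-bit indices to 16-bit ones. The old path ran this loop on the CPU;
     * the new path runs roughly one compute shader invocation per element. */
    static void widen_ubyte_indices(uint16_t *dst, const uint8_t *src, unsigned count)
    {
       for (unsigned i = 0; i < count; i++)
          dst[i] = src[i]; /* zero-extend */
    }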

Closes: https://gitlab.freedesktop.org/mesa/mesa/-/issues/10195
Reviewed-by: Marek Olšák <marek.olsak@amd.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/26416>
Authored by Pierre-Eric Pelloux-Prayer on 2023-11-30 10:15:40 +01:00, committed by Marge Bot
parent 5bdb42b1a2
commit 1a99f50c7f
5 changed files with 79 additions and 8 deletions


@@ -526,6 +526,41 @@ void si_copy_buffer(struct si_context *sctx, struct pipe_resource *dst, struct p
}
}
void si_compute_shorten_ubyte_buffer(struct si_context *sctx, struct pipe_resource *dst, struct pipe_resource *src,
                                     uint64_t dst_offset, uint64_t src_offset, unsigned size, unsigned flags)
{
   if (!size)
      return;

   if (!sctx->cs_ubyte_to_ushort)
      sctx->cs_ubyte_to_ushort = si_create_ubyte_to_ushort_compute_shader(sctx);

   enum si_coherency coher = SI_COHERENCY_SHADER;
   si_improve_sync_flags(sctx, dst, src, &flags);

   struct pipe_grid_info info = {};
   info.block[0] = si_determine_wave_size(sctx->screen, NULL);
   info.block[1] = 1;
   info.block[2] = 1;
   info.grid[0] = DIV_ROUND_UP(size, info.block[0]);
   info.grid[1] = 1;
   info.grid[2] = 1;
   info.last_block[0] = size % info.block[0];

   struct pipe_shader_buffer sb[2] = {};
   sb[0].buffer = dst;
   sb[0].buffer_offset = dst_offset;
   sb[0].buffer_size = dst->width0;

   sb[1].buffer = src;
   sb[1].buffer_offset = src_offset;
   sb[1].buffer_size = src->width0;

   si_launch_grid_internal_ssbos(sctx, &info, sctx->cs_ubyte_to_ushort, flags, coher,
                                 2, sb, 0x1);
}
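
The grid above is sized so that each invocation converts one index: block[0] is the wave size, grid[0] covers size elements, and last_block[0] trims the final partial workgroup. As a worked example (assuming a wave size of 64): converting 1000 indices gives grid[0] = DIV_ROUND_UP(1000, 64) = 16 workgroups and last_block[0] = 1000 % 64 = 40, so the last workgroup runs only 40 of its 64 invocations.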
static unsigned
set_work_size(struct pipe_grid_info *info, unsigned block_x, unsigned block_y, unsigned block_z,
              unsigned work_x, unsigned work_y, unsigned work_z)


@@ -276,6 +276,8 @@ static void si_destroy_context(struct pipe_context *context)
      sctx->b.delete_compute_state(&sctx->b, sctx->cs_clear_buffer_rmw);
   if (sctx->cs_copy_buffer)
      sctx->b.delete_compute_state(&sctx->b, sctx->cs_copy_buffer);
   if (sctx->cs_ubyte_to_ushort)
      sctx->b.delete_compute_state(&sctx->b, sctx->cs_ubyte_to_ushort);
   for (unsigned i = 0; i < ARRAY_SIZE(sctx->cs_copy_image); i++) {
      for (unsigned j = 0; j < ARRAY_SIZE(sctx->cs_copy_image[i]); j++) {
         for (unsigned k = 0; k < ARRAY_SIZE(sctx->cs_copy_image[i][j]); k++) {


@@ -1025,6 +1025,7 @@ struct si_context {
   void *cs_clear_buffer;
   void *cs_clear_buffer_rmw;
   void *cs_copy_buffer;
   void *cs_ubyte_to_ushort;
   void *cs_copy_image[3][2][2]; /* [wg_dim-1][src_is_1d][dst_is_1d] */
   void *cs_clear_render_target;
   void *cs_clear_render_target_1d_array;
@@ -1491,6 +1492,8 @@ void si_compute_clear_buffer_rmw(struct si_context *sctx, struct pipe_resource *
                                 unsigned flags, enum si_coherency coher);
void si_copy_buffer(struct si_context *sctx, struct pipe_resource *dst, struct pipe_resource *src,
                    uint64_t dst_offset, uint64_t src_offset, unsigned size, unsigned flags);
void si_compute_shorten_ubyte_buffer(struct si_context *sctx, struct pipe_resource *dst, struct pipe_resource *src,
                                     uint64_t dst_offset, uint64_t src_offset, unsigned size, unsigned flags);
bool si_compute_copy_image(struct si_context *sctx, struct pipe_resource *dst, unsigned dst_level,
                           struct pipe_resource *src, unsigned src_level, unsigned dstx,
                           unsigned dsty, unsigned dstz, const struct pipe_box *src_box,
@@ -1654,6 +1657,7 @@ void *si_get_blitter_vs(struct si_context *sctx, enum blitter_attrib_type type,
                        unsigned num_layers);
void *si_create_dma_compute_shader(struct si_context *sctx, unsigned num_dwords_per_thread,
                                   bool dst_stream_cache_policy, bool is_copy);
void *si_create_ubyte_to_ushort_compute_shader(struct si_context *sctx);
void *si_create_clear_buffer_rmw_cs(struct si_context *sctx);
void *si_clear_render_target_shader(struct si_context *sctx, enum pipe_texture_target type);
void *si_clear_12bytes_buffer_shader(struct si_context *sctx);


@@ -624,6 +624,37 @@ void *si_clear_12bytes_buffer_shader(struct si_context *sctx)
   return create_shader_state(sctx, b.shader);
}
void *si_create_ubyte_to_ushort_compute_shader(struct si_context *sctx)
{
   const nir_shader_compiler_options *options =
      sctx->b.screen->get_compiler_options(sctx->b.screen, PIPE_SHADER_IR_NIR, PIPE_SHADER_COMPUTE);

   unsigned store_qualifier = ACCESS_COHERENT | ACCESS_RESTRICT;
   /* Don't cache loads, because there is no reuse. */
   unsigned load_qualifier = store_qualifier | ACCESS_NON_TEMPORAL;

   nir_builder b =
      nir_builder_init_simple_shader(MESA_SHADER_COMPUTE, options, "ubyte_to_ushort");

   unsigned default_wave_size = si_determine_wave_size(sctx->screen, NULL);
   b.shader->info.workgroup_size[0] = default_wave_size;
   b.shader->info.workgroup_size[1] = 1;
   b.shader->info.workgroup_size[2] = 1;
   b.shader->info.num_ssbos = 2;

   nir_def *load_address = get_global_ids(&b, 1);
   nir_def *store_address = nir_imul_imm(&b, load_address, 2);

   nir_def *ubyte_value = nir_load_ssbo(&b, 1, 8, nir_imm_int(&b, 1),
                                        load_address, .access = load_qualifier);
   nir_store_ssbo(&b, nir_u2uN(&b, ubyte_value, 16), nir_imm_int(&b, 0),
                  store_address, .access = store_qualifier);

   return create_shader_state(sctx, b.shader);
}
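
Each invocation of the shader built above loads one byte from SSBO 1 (the source ubyte index buffer) at byte offset load_address = global invocation id, zero-extends it to 16 bits with nir_u2uN, and stores the result to SSBO 0 (the destination ushort buffer) at byte offset store_address = 2 * id; for example, invocation 10 reads source byte 10 and writes destination bytes 20 and 21. This matches the sb[0] = dst / sb[1] = src binding order set up by si_compute_shorten_ubyte_buffer() earlier in this commit.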
/* Create a compute shader implementing clear_buffer or copy_buffer. */
void *si_create_dma_compute_shader(struct si_context *sctx, unsigned num_dwords_per_thread,
bool dst_stream_cache_policy, bool is_copy)


@@ -2044,23 +2044,22 @@ static void si_draw(struct pipe_context *ctx,
   /* Translate or upload, if needed. */
   /* 8-bit indices are supported on GFX8. */
   if (!IS_DRAW_VERTEX_STATE && GFX_VERSION <= GFX7 && index_size == 1) {
-      unsigned start, count, start_offset, size, offset;
-      void *ptr;
+      unsigned start, count, start_offset, size;

      si_get_draw_start_count(sctx, info, indirect, draws, num_draws, &start, &count);
      start_offset = start * 2;
      size = count * 2;

-      indexbuf = NULL;
-      u_upload_alloc(ctx->stream_uploader, start_offset, size,
-                     si_optimal_tcc_alignment(sctx, size), &offset, &indexbuf, &ptr);
+      /* Don't use u_upload_alloc because we don't need to map the buffer for CPU access. */
+      indexbuf = pipe_buffer_create(&sctx->screen->b, 0, PIPE_USAGE_IMMUTABLE, start_offset + size);
      if (unlikely(!indexbuf))
         return;

-      util_shorten_ubyte_elts_to_userptr(&sctx->b, info, 0, 0, index_offset + start, count, ptr);
+      si_compute_shorten_ubyte_buffer(sctx, indexbuf, info->index.resource,
+                                      start_offset, index_offset + start, count,
+                                      SI_OP_SYNC_AFTER);

      /* info->start will be added by the drawing code */
-      index_offset = offset - start_offset;
+      index_offset = 0;
      index_size = 2;
   } else if (!IS_DRAW_VERTEX_STATE && info->has_user_indices) {
      unsigned start_offset;