radeonsi/sqtt: re-export shaders in a single bo

RGP expects all of a pipeline's shaders to be stored sequentially, e.g.:

  [vs][ps][gs]

As such, it assumes a single bo is dumped to the .rgp file, with
the following info (pictured in the sketch below):
  * va of the bo
  * offset of each shader inside the bo
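For illustration, the assumed layout can be pictured as a struct
(hypothetical names, not an actual RGP type):

  /* Hypothetical sketch of the per-pipeline info RGP expects: */
  struct rgp_pipeline_layout {
     uint64_t base_va;    /* va of the single bo holding all shaders */
     uint32_t offset[3];  /* per-stage offsets inside the bo, e.g. vs/ps/gs */
  };
  /* Shader N's code must be readable at base_va + offset[N]. */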

For radeonsi, the shaders are stored individually, so there can be
a big gap between the shaders forming a pipeline => we can produce
very large files, because the layout in the file must match the one
in memory (see the warning in ac_rgp_file_write_elf_text).
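
As a made-up illustration of the blow-up (all addresses and sizes
below are invented):

  uint64_t vs_va   = 0x100000000;   /* vs shader bo */
  uint64_t ps_va   = 0x2500000000;  /* ps shader bo, allocated far away */
  uint64_t ps_size = 0x2000;
  /* The dumped .text must cover the whole span, gaps included: */
  uint64_t dumped  = ps_va + ps_size - vs_va;  /* ~150 GB of mostly padding */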

This commit implements a workaround: gfx shaders are re-uploaded
back-to-back into a single bo and re-exported as a pipeline.

To update the shader addresses, a new state (sqtt_pipeline) is
created, which overwrites the needed SPI_SHADER_PGM_LO_* registers.
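
Concretely, the override re-emits the low address dword against the
shader's copy in the shared bo, along the lines of (a simplified
extract of the change below):

  /* For each gfx shader i re-uploaded into the shared bo: */
  uint32_t va_low = (pipeline->bo->gpu_address + pipeline->offset[i]) >> 8;
  si_pm4_set_reg(&pipeline->pm4, reg, va_low); /* reg = the stage's *_PGM_LO_* */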

This reduces Deus Ex rgp captures from 150GB+ to less than 100MB.

Reviewed-by: Marek Olšák <marek.olsak@amd.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/18865>
Author:    Pierre-Eric Pelloux-Prayer
Date:      2022-09-26 14:34:03 +02:00
Committed: Marge Bot
parent 5a9a43c8f9
commit 8034a71430
10 changed files with 151 additions and 23 deletions

@@ -53,6 +53,8 @@ struct ac_thread_trace_data {
struct rgp_queue_event rgp_queue_event;
struct rgp_clock_calibration rgp_clock_calibration;
struct hash_table_u64 *pipeline_bos;
};
#define SQTT_BUFFER_ALIGN_SHIFT 12

@@ -320,11 +320,18 @@ static void si_bind_compute_state(struct pipe_context *ctx, void *state)
program->shader.binary.elf_buffer,
program->shader.binary.elf_size,
0);
uint64_t base_address = program->shader.bo->gpu_address;
struct ac_thread_trace_data *thread_trace_data = sctx->thread_trace;
if (!si_sqtt_pipeline_is_registered(thread_trace_data, pipeline_code_hash)) {
si_sqtt_register_pipeline(sctx, pipeline_code_hash, base_address, true);
/* Short-lived fake pipeline: we don't need to re-upload the compute shaders
 * as we do for the gfx ones, so just create a temp pipeline to be able to
 * call si_sqtt_register_pipeline, and then drop it.
 */
struct si_sqtt_fake_pipeline pipeline = { 0 };
pipeline.code_hash = pipeline_code_hash;
pipeline.bo = program->shader.bo;
si_sqtt_register_pipeline(sctx, &pipeline, true);
}
si_sqtt_describe_pipeline_bind(sctx, pipeline_code_hash, 1);

@@ -912,6 +912,13 @@ struct si_saved_cs {
int64_t time_flush;
};
struct si_sqtt_fake_pipeline {
struct si_pm4_state pm4; /* base class */
uint64_t code_hash;
struct si_resource *bo;
uint32_t offset[SI_NUM_GRAPHICS_SHADERS];
};
struct si_small_prim_cull_info {
float scale[2], translate[2];
float scale_no_aa[2], translate_no_aa[2];
@@ -1646,7 +1653,7 @@ void si_sqtt_write_event_marker(struct si_context* sctx, struct radeon_cmdbuf *r
uint32_t vertex_offset_user_data,
uint32_t instance_offset_user_data,
uint32_t draw_index_user_data);
bool si_sqtt_register_pipeline(struct si_context* sctx, uint64_t pipeline_hash, uint64_t base_address, bool is_compute);
bool si_sqtt_register_pipeline(struct si_context* sctx, struct si_sqtt_fake_pipeline *pipeline, bool is_compute);
bool si_sqtt_pipeline_is_registered(struct ac_thread_trace_data *thread_trace_data,
uint64_t pipeline_hash);
void si_sqtt_describe_pipeline_bind(struct si_context* sctx, uint64_t pipeline_hash, int bind_point);

@@ -55,6 +55,9 @@ struct si_pm4_state {
/* commands for the DE */
uint16_t max_dw;
/* Used by SQTT to override the shader address */
uint16_t reg_va_low_idx;
/* This must be the last field because the array can continue after the structure. */
uint32_t pm4[64];
};
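
For reference, reg_va_low_idx points at the value dword of the SET_SH_REG
packet that si_pm4_set_reg emitted for the *_PGM_LO_* register, which is why
the code further below can recover the packet header and register offset from
the two preceding dwords (a sketch of the dword layout, not the exact Mesa
macros):

  /* pm4[reg_va_low_idx - 2]  PKT3 header, opcode PKT3_SET_SH_REG
   * pm4[reg_va_low_idx - 1]  (reg - SI_SH_REG_OFFSET) >> 2, register offset
   * pm4[reg_va_low_idx    ]  va >> 8, the dword sqtt_pipeline overrides
   */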

@@ -817,8 +817,8 @@ static unsigned get_lds_granularity(struct si_screen *screen, gl_shader_stage st
screen->info.gfx_level >= GFX7 ? 512 : 256;
}
static bool si_shader_binary_open(struct si_screen *screen, struct si_shader *shader,
struct ac_rtld_binary *rtld)
bool si_shader_binary_open(struct si_screen *screen, struct si_shader *shader,
struct ac_rtld_binary *rtld)
{
const struct si_shader_selector *sel = shader->selector;
const char *part_elfs[5];
@@ -889,8 +889,8 @@ static unsigned si_get_shader_binary_size(struct si_screen *screen, struct si_sh
return size;
}
static bool si_get_external_symbol(enum amd_gfx_level gfx_level, void *data, const char *name,
uint64_t *value)
bool si_get_external_symbol(enum amd_gfx_level gfx_level, void *data, const char *name,
uint64_t *value)
{
uint64_t *scratch_va = data;

@@ -960,6 +960,8 @@ struct si_shader_part {
};
/* si_shader.c */
struct ac_rtld_binary;
void si_update_shader_binary_info(struct si_shader *shader, nir_shader *nir);
bool si_compile_shader(struct si_screen *sscreen, struct ac_llvm_compiler *compiler,
struct si_shader *shader, struct util_debug_callback *debug);
@@ -979,6 +981,10 @@ const char *si_get_shader_name(const struct si_shader *shader);
void si_shader_binary_clean(struct si_shader_binary *binary);
struct nir_shader *si_deserialize_shader(struct si_shader_selector *sel);
unsigned si_get_ps_num_interp(struct si_shader *ps);
bool si_shader_binary_open(struct si_screen *screen, struct si_shader *shader,
struct ac_rtld_binary *rtld);
bool si_get_external_symbol(enum amd_gfx_level gfx_level, void *data, const char *name,
uint64_t *value);
/* si_shader_info.c */
void si_nir_scan_shader(struct si_screen *sscreen, const struct nir_shader *nir,

@@ -24,6 +24,7 @@
*/
#include "hash_table.h"
#include "si_pipe.h"
#include "si_build_pm4.h"
#include "si_compute.h"
@@ -55,6 +56,8 @@ si_thread_trace_init_bo(struct si_context *sctx)
1 << SQTT_BUFFER_ALIGN_SHIFT);
size += sctx->thread_trace->buffer_size * (uint64_t)max_se;
sctx->thread_trace->pipeline_bos = _mesa_hash_table_u64_create(NULL);
sctx->thread_trace->bo =
ws->buffer_create(ws, size, 4096,
RADEON_DOMAIN_VRAM,
@@ -697,6 +700,12 @@ si_destroy_thread_trace(struct si_context *sctx)
}
simple_mtx_destroy(&sctx->thread_trace->rgp_code_object.lock);
hash_table_foreach(sctx->thread_trace->pipeline_bos->table, entry) {
struct si_sqtt_fake_pipeline *pipeline = (struct si_sqtt_fake_pipeline *)entry->data;
si_resource_reference(&pipeline->bo, NULL);
FREE(pipeline);
}
free(sctx->thread_trace);
sctx->thread_trace = NULL;
@@ -1010,7 +1019,7 @@ si_sqtt_pipe_to_rgp_shader_stage(union si_shader_key* key, enum pipe_shader_type
static bool
si_sqtt_add_code_object(struct si_context* sctx,
uint64_t pipeline_hash,
struct si_sqtt_fake_pipeline *pipeline,
bool is_compute)
{
struct ac_thread_trace_data *thread_trace_data = sctx->thread_trace;
@@ -1023,8 +1032,8 @@ si_sqtt_add_code_object(struct si_context* sctx,
record->shader_stages_mask = 0;
record->num_shaders_combined = 0;
record->pipeline_hash[0] = pipeline_hash;
record->pipeline_hash[1] = pipeline_hash;
record->pipeline_hash[0] = pipeline->code_hash;
record->pipeline_hash[1] = pipeline->code_hash;
for (unsigned i = 0; i < PIPE_SHADER_TYPES; i++) {
struct si_shader *shader;
@@ -1051,7 +1060,7 @@ si_sqtt_add_code_object(struct si_context* sctx,
}
memcpy(code, shader->binary.uploaded_code, shader->binary.uploaded_code_size);
uint64_t va = shader->bo->gpu_address;
uint64_t va = pipeline->bo->gpu_address + pipeline->offset[i];
unsigned gl_shader_stage = tgsi_processor_to_shader_stage(i);
record->shader_data[gl_shader_stage].hash[0] = _mesa_hash_data(code, shader->binary.uploaded_code_size);
record->shader_data[gl_shader_stage].hash[1] = record->shader_data[gl_shader_stage].hash[0];
@@ -1079,21 +1088,21 @@ si_sqtt_add_code_object(struct si_context* sctx,
}
bool
si_sqtt_register_pipeline(struct si_context* sctx, uint64_t pipeline_hash, uint64_t base_address, bool is_compute)
si_sqtt_register_pipeline(struct si_context* sctx, struct si_sqtt_fake_pipeline *pipeline, bool is_compute)
{
struct ac_thread_trace_data *thread_trace_data = sctx->thread_trace;
assert (!si_sqtt_pipeline_is_registered(thread_trace_data, pipeline_hash));
assert (!si_sqtt_pipeline_is_registered(thread_trace_data, pipeline->code_hash));
bool result = ac_sqtt_add_pso_correlation(thread_trace_data, pipeline_hash);
bool result = ac_sqtt_add_pso_correlation(thread_trace_data, pipeline->code_hash);
if (!result)
return false;
result = ac_sqtt_add_code_object_loader_event(thread_trace_data, pipeline_hash, base_address);
result = ac_sqtt_add_code_object_loader_event(thread_trace_data, pipeline->code_hash, pipeline->bo->gpu_address);
if (!result)
return false;
return si_sqtt_add_code_object(sctx, pipeline_hash, is_compute);
return si_sqtt_add_code_object(sctx, pipeline, is_compute);
}
void

@@ -187,6 +187,7 @@ union si_state {
struct si_pm4_state *vgt_shader_config;
struct si_shader *vs;
struct si_shader *ps;
struct si_sqtt_fake_pipeline *sqtt_pipeline;
} named;
struct si_pm4_state *array[sizeof(struct si_state_named) / sizeof(struct si_pm4_state *)];
};

@@ -29,6 +29,8 @@
#include "util/u_index_modify.h"
#include "util/u_prim.h"
#include "util/u_upload_mgr.h"
#include "ac_rtld.h"
#include "si_build_pm4.h"
#if (GFX_VER == 6)
#define GFX(name) name##GFX6
@@ -303,28 +305,107 @@ static bool si_update_shaders(struct si_context *sctx)
}
if (unlikely(sctx->screen->debug_flags & DBG(SQTT) && sctx->thread_trace)) {
/* Pretend the bound shaders form a vk pipeline */
uint32_t pipeline_code_hash = 0;
uint64_t base_address = ~0;
/* Pretend the bound shaders form a vk pipeline. Include the scratch size in
* the hash calculation to force re-emitting the pipeline if the scratch bo
* changes.
*/
uint64_t scratch_bo_size = sctx->scratch_buffer ? sctx->scratch_buffer->bo_size : 0;
uint64_t pipeline_code_hash = scratch_bo_size;
uint32_t total_size = 0;
/* Compute pipeline code hash. */
for (int i = 0; i < SI_NUM_GRAPHICS_SHADERS; i++) {
struct si_shader *shader = sctx->shaders[i].current;
if (sctx->shaders[i].cso && shader) {
pipeline_code_hash = _mesa_hash_data_with_seed(
pipeline_code_hash = XXH64(
shader->binary.elf_buffer,
shader->binary.elf_size,
pipeline_code_hash);
base_address = MIN2(base_address,
shader->bo->gpu_address);
total_size += ALIGN(shader->binary.uploaded_code_size, 256);
}
}
struct si_sqtt_fake_pipeline *pipeline = NULL;
struct ac_thread_trace_data *thread_trace_data = sctx->thread_trace;
if (!si_sqtt_pipeline_is_registered(thread_trace_data, pipeline_code_hash)) {
si_sqtt_register_pipeline(sctx, pipeline_code_hash, base_address, false);
/* This is a new pipeline. Allocate a new bo to hold all the shaders. Without
 * this, the shader code export process creates huge rgp files because RGP
 * assumes the shaders live sequentially in memory (shader N address =
 * shader 0 address + offset N).
 */
struct si_resource *bo = si_aligned_buffer_create(
&sctx->screen->b,
(sctx->screen->info.cpdma_prefetch_writes_memory ? 0 : SI_RESOURCE_FLAG_READ_ONLY) |
SI_RESOURCE_FLAG_DRIVER_INTERNAL | SI_RESOURCE_FLAG_32BIT,
PIPE_USAGE_IMMUTABLE, align(total_size, SI_CPDMA_ALIGNMENT), 256);
char *ptr = (char *) (bo ? sctx->screen->ws->buffer_map(sctx->screen->ws,
bo->buf, NULL,
(enum pipe_map_flags)(PIPE_MAP_READ_WRITE | PIPE_MAP_UNSYNCHRONIZED | RADEON_MAP_TEMPORARY)) :
NULL);
uint32_t offset = 0;
uint64_t scratch_va = sctx->scratch_buffer ? sctx->scratch_buffer->gpu_address : 0;
if (ptr) {
pipeline = (struct si_sqtt_fake_pipeline *)
CALLOC(1, sizeof(struct si_sqtt_fake_pipeline));
pipeline->code_hash = pipeline_code_hash;
si_resource_reference(&pipeline->bo, bo);
/* Re-upload all gfx shaders and init PM4. */
si_pm4_clear_state(&pipeline->pm4);
for (int i = 0; i < SI_NUM_GRAPHICS_SHADERS; i++) {
struct si_shader *shader = sctx->shaders[i].current;
if (sctx->shaders[i].cso && shader) {
struct ac_rtld_binary binary;
si_shader_binary_open(sctx->screen, shader, &binary);
struct ac_rtld_upload_info u = {};
u.binary = &binary;
u.get_external_symbol = si_get_external_symbol;
u.cb_data = &scratch_va;
u.rx_va = bo->gpu_address + offset;
u.rx_ptr = ptr + offset;
int size = ac_rtld_upload(&u);
ac_rtld_close(&binary);
pipeline->offset[i] = offset;
offset += align(size, 256);
struct si_pm4_state *pm4 = &shader->pm4;
uint32_t va_low = (pipeline->bo->gpu_address + pipeline->offset[i]) >> 8;
assert(PKT3_IT_OPCODE_G(pm4->pm4[pm4->reg_va_low_idx - 2]) == PKT3_SET_SH_REG);
uint32_t reg = (pm4->pm4[pm4->reg_va_low_idx - 1] << 2) + SI_SH_REG_OFFSET;
si_pm4_set_reg(&pipeline->pm4, reg, va_low);
}
}
sctx->screen->ws->buffer_unmap(sctx->screen->ws, bo->buf);
_mesa_hash_table_u64_insert(sctx->thread_trace->pipeline_bos,
pipeline_code_hash, pipeline);
si_sqtt_register_pipeline(sctx, pipeline, false);
} else {
if (bo)
si_resource_reference(&bo, NULL);
}
} else {
pipeline = (struct si_sqtt_fake_pipeline *)
_mesa_hash_table_u64_search(sctx->thread_trace->pipeline_bos, pipeline_code_hash);
}
assert(pipeline);
pipeline->code_hash = pipeline_code_hash;
radeon_add_to_buffer_list(sctx, &sctx->gfx_cs, pipeline->bo,
RADEON_USAGE_READ | RADEON_PRIO_SHADER_BINARY);
si_sqtt_describe_pipeline_bind(sctx, pipeline_code_hash, 0);
si_pm4_bind_state(sctx, sqtt_pipeline, pipeline);
}
if ((GFX_VERSION <= GFX8 &&

@@ -695,6 +695,7 @@ static void si_shader_ls(struct si_screen *sscreen, struct si_shader *shader)
va = shader->bo->gpu_address;
si_pm4_set_reg(pm4, R_00B520_SPI_SHADER_PGM_LO_LS, va >> 8);
pm4->reg_va_low_idx = pm4->ndw - 1;
shader->config.rsrc1 = S_00B528_VGPRS((shader->config.num_vgprs - 1) / 4) |
S_00B528_SGPRS((shader->config.num_sgprs - 1) / 8) |
@@ -729,6 +730,7 @@ static void si_shader_hs(struct si_screen *sscreen, struct si_shader *shader)
} else {
si_pm4_set_reg(pm4, R_00B410_SPI_SHADER_PGM_LO_LS, va >> 8);
}
pm4->reg_va_low_idx = pm4->ndw - 1;
unsigned num_user_sgprs = si_get_num_vs_user_sgprs(shader, GFX9_TCS_NUM_USER_SGPR);
@@ -741,6 +743,7 @@ static void si_shader_hs(struct si_screen *sscreen, struct si_shader *shader)
shader->config.rsrc2 |= S_00B42C_USER_SGPR_MSB_GFX9(num_user_sgprs >> 5);
} else {
si_pm4_set_reg(pm4, R_00B420_SPI_SHADER_PGM_LO_HS, va >> 8);
pm4->reg_va_low_idx = pm4->ndw - 1;
si_pm4_set_reg(pm4, R_00B424_SPI_SHADER_PGM_HI_HS,
S_00B424_MEM_BASE(sscreen->info.address32_hi >> 8));
@@ -816,6 +819,7 @@ static void si_shader_es(struct si_screen *sscreen, struct si_shader *shader)
oc_lds_en = shader->selector->stage == MESA_SHADER_TESS_EVAL ? 1 : 0;
si_pm4_set_reg(pm4, R_00B320_SPI_SHADER_PGM_LO_ES, va >> 8);
pm4->reg_va_low_idx = pm4->ndw - 1;
si_pm4_set_reg(pm4, R_00B324_SPI_SHADER_PGM_HI_ES,
S_00B324_MEM_BASE(sscreen->info.address32_hi >> 8));
si_pm4_set_reg(pm4, R_00B328_SPI_SHADER_PGM_RSRC1_ES,
@@ -1104,6 +1108,7 @@ static void si_shader_gs(struct si_screen *sscreen, struct si_shader *shader)
} else {
si_pm4_set_reg(pm4, R_00B210_SPI_SHADER_PGM_LO_ES, va >> 8);
}
pm4->reg_va_low_idx = pm4->ndw - 1;
uint32_t rsrc1 = S_00B228_VGPRS((shader->config.num_vgprs - 1) / 4) | S_00B228_DX10_CLAMP(1) |
S_00B228_MEM_ORDERED(si_shader_mem_ordered(shader)) |
@@ -1149,6 +1154,7 @@ static void si_shader_gs(struct si_screen *sscreen, struct si_shader *shader)
S_00B21C_WAVE_LIMIT(0x3F);
si_pm4_set_reg(pm4, R_00B220_SPI_SHADER_PGM_LO_GS, va >> 8);
pm4->reg_va_low_idx = pm4->ndw - 1;
si_pm4_set_reg(pm4, R_00B224_SPI_SHADER_PGM_HI_GS,
S_00B224_MEM_BASE(sscreen->info.address32_hi >> 8));
@@ -1421,6 +1427,7 @@ static void gfx10_shader_ngg(struct si_screen *sscreen, struct si_shader *shader
&late_alloc_wave64, &cu_mask);
si_pm4_set_reg(pm4, R_00B320_SPI_SHADER_PGM_LO_ES, va >> 8);
pm4->reg_va_low_idx = pm4->ndw - 1;
si_pm4_set_reg(
pm4, R_00B228_SPI_SHADER_PGM_RSRC1_GS,
S_00B228_VGPRS((shader->config.num_vgprs - 1) / (shader->wave_size == 32 ? 8 : 4)) |
@@ -1733,6 +1740,7 @@ static void si_shader_vs(struct si_screen *sscreen, struct si_shader *shader,
}
si_pm4_set_reg(pm4, R_00B120_SPI_SHADER_PGM_LO_VS, va >> 8);
pm4->reg_va_low_idx = pm4->ndw - 1;
si_pm4_set_reg(pm4, R_00B124_SPI_SHADER_PGM_HI_VS,
S_00B124_MEM_BASE(sscreen->info.address32_hi >> 8));
@@ -2008,6 +2016,7 @@ static void si_shader_ps(struct si_screen *sscreen, struct si_shader *shader)
va = shader->bo->gpu_address;
si_pm4_set_reg(pm4, R_00B020_SPI_SHADER_PGM_LO_PS, va >> 8);
pm4->reg_va_low_idx = pm4->ndw - 1;
si_pm4_set_reg(pm4, R_00B024_SPI_SHADER_PGM_HI_PS,
S_00B024_MEM_BASE(sscreen->info.address32_hi >> 8));
@@ -2078,6 +2087,9 @@ static void si_shader_init_pm4_state(struct si_screen *sscreen, struct si_shader
default:
assert(0);
}
if (unlikely(sscreen->debug_flags & DBG(SQTT)))
assert(shader->pm4.reg_va_low_idx != 0);
}
static void si_clear_vs_key_inputs(struct si_context *sctx, union si_shader_key *key,