From 8034a71430be0b6473449028d90937729b77d6d9 Mon Sep 17 00:00:00 2001
From: Pierre-Eric Pelloux-Prayer <pierre-eric.pelloux-prayer@amd.com>
Date: Mon, 26 Sep 2022 14:34:03 +0200
Subject: [PATCH] radeonsi/sqtt: re-export shaders in a single bo
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

RGP expects all of a pipeline's shaders to be stored sequentially, e.g.:

  [vs][ps][gs]

As such, it assumes a single bo is dumped to the .rgp file, with
the following info:
  * va of the bo
  * offset to each shader inside the bo

For radeonsi, the shaders are stored individually, so there may be
a big gap between the shaders forming a pipeline. This can produce
very large files because the layout in the file must match the one
in memory (see the warning in ac_rgp_file_write_elf_text).

This commit implements a workaround: gfx shaders are re-exported as a
pipeline.

To update the shader address, a new state is created (sqtt_pipeline),
which will overwrite the needed _PGM_LO_* registers.

This reduces Deus Ex rgp captures from 150GB+ to less than 100MB.

Reviewed-by: Marek Olšák <marek.olsak@amd.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/18865>
---
 src/amd/common/ac_sqtt.h                      |  2 +
 src/gallium/drivers/radeonsi/si_compute.c     | 11 ++-
 src/gallium/drivers/radeonsi/si_pipe.h        |  9 +-
 src/gallium/drivers/radeonsi/si_pm4.h         |  3 +
 src/gallium/drivers/radeonsi/si_shader.c      |  8 +-
 src/gallium/drivers/radeonsi/si_shader.h      |  6 ++
 src/gallium/drivers/radeonsi/si_sqtt.c        | 27 ++++--
 src/gallium/drivers/radeonsi/si_state.h       |  1 +
 .../drivers/radeonsi/si_state_draw.cpp        | 95 +++++++++++++++++--
 .../drivers/radeonsi/si_state_shaders.cpp     | 12 +++
 10 files changed, 151 insertions(+), 23 deletions(-)

diff --git a/src/amd/common/ac_sqtt.h b/src/amd/common/ac_sqtt.h
index 6c1709a5730..b9aadbf0735 100644
--- a/src/amd/common/ac_sqtt.h
+++ b/src/amd/common/ac_sqtt.h
@@ -53,6 +53,8 @@ struct ac_thread_trace_data {
    struct rgp_queue_event rgp_queue_event;
 
    struct rgp_clock_calibration rgp_clock_calibration;
+
+   struct hash_table_u64 *pipeline_bos;
 };
 
 #define SQTT_BUFFER_ALIGN_SHIFT 12
diff --git a/src/gallium/drivers/radeonsi/si_compute.c b/src/gallium/drivers/radeonsi/si_compute.c
index 3e09dda62bf..f59cf3aed81 100644
--- a/src/gallium/drivers/radeonsi/si_compute.c
+++ b/src/gallium/drivers/radeonsi/si_compute.c
@@ -320,11 +320,18 @@ static void si_bind_compute_state(struct pipe_context *ctx, void *state)
          program->shader.binary.elf_buffer,
          program->shader.binary.elf_size,
          0);
-      uint64_t base_address = program->shader.bo->gpu_address;
 
       struct ac_thread_trace_data *thread_trace_data = sctx->thread_trace;
       if (!si_sqtt_pipeline_is_registered(thread_trace_data, pipeline_code_hash)) {
-         si_sqtt_register_pipeline(sctx, pipeline_code_hash, base_address, true);
+         /* Short-lived fake pipeline: unlike the gfx shaders, the compute
+          * shaders don't need to be re-uploaded, so just create a temporary
+          * pipeline to be able to call si_sqtt_register_pipeline, then drop it.
+          */
+         struct si_sqtt_fake_pipeline pipeline = { 0 };
+         pipeline.code_hash = pipeline_code_hash;
+         pipeline.bo = program->shader.bo;
+
+         si_sqtt_register_pipeline(sctx, &pipeline, true);
       }
 
       si_sqtt_describe_pipeline_bind(sctx, pipeline_code_hash, 1);
diff --git a/src/gallium/drivers/radeonsi/si_pipe.h b/src/gallium/drivers/radeonsi/si_pipe.h
index 811f34075db..95e252187b4 100644
--- a/src/gallium/drivers/radeonsi/si_pipe.h
+++ b/src/gallium/drivers/radeonsi/si_pipe.h
@@ -912,6 +912,13 @@ struct si_saved_cs {
    int64_t time_flush;
 };
 
+struct si_sqtt_fake_pipeline { /* shaders registered with SQTT as one pipeline */
+   struct si_pm4_state pm4; /* base class */
+   uint64_t code_hash; /* pipeline hash passed to the SQTT registration helpers */
+   struct si_resource *bo; /* buffer whose gpu_address is the pipeline base address */
+   uint32_t offset[SI_NUM_GRAPHICS_SHADERS]; /* per-stage offset added to bo->gpu_address */
+};
+
 struct si_small_prim_cull_info {
    float scale[2], translate[2];
    float scale_no_aa[2], translate_no_aa[2];
@@ -1646,7 +1653,7 @@ void si_sqtt_write_event_marker(struct si_context* sctx, struct radeon_cmdbuf *r
                                 uint32_t vertex_offset_user_data,
                                 uint32_t instance_offset_user_data,
                                 uint32_t draw_index_user_data);
-bool si_sqtt_register_pipeline(struct si_context* sctx, uint64_t pipeline_hash, uint64_t base_address, bool is_compute);
+bool si_sqtt_register_pipeline(struct si_context* sctx, struct si_sqtt_fake_pipeline *pipeline, bool is_compute);
 bool si_sqtt_pipeline_is_registered(struct ac_thread_trace_data *thread_trace_data,
                                     uint64_t pipeline_hash);
 void si_sqtt_describe_pipeline_bind(struct si_context* sctx, uint64_t pipeline_hash, int bind_point);
diff --git a/src/gallium/drivers/radeonsi/si_pm4.h b/src/gallium/drivers/radeonsi/si_pm4.h
index 1b657e75d89..4d1770a96d8 100644
--- a/src/gallium/drivers/radeonsi/si_pm4.h
+++ b/src/gallium/drivers/radeonsi/si_pm4.h
@@ -55,6 +55,9 @@ struct si_pm4_state {
    /* commands for the DE */
    uint16_t max_dw;
 
+   /* Used by SQTT to override the shader address */
+   uint16_t reg_va_low_idx;
+
    /* This must be the last field because the array can continue after the structure. */
    uint32_t pm4[64];
 };
diff --git a/src/gallium/drivers/radeonsi/si_shader.c b/src/gallium/drivers/radeonsi/si_shader.c
index a9a534548ff..c9eb13927a7 100644
--- a/src/gallium/drivers/radeonsi/si_shader.c
+++ b/src/gallium/drivers/radeonsi/si_shader.c
@@ -817,8 +817,8 @@ static unsigned get_lds_granularity(struct si_screen *screen, gl_shader_stage st
           screen->info.gfx_level >= GFX7 ? 512 : 256;
 }
 
-static bool si_shader_binary_open(struct si_screen *screen, struct si_shader *shader,
-                                  struct ac_rtld_binary *rtld)
+bool si_shader_binary_open(struct si_screen *screen, struct si_shader *shader,
+                           struct ac_rtld_binary *rtld)
 {
    const struct si_shader_selector *sel = shader->selector;
    const char *part_elfs[5];
@@ -889,8 +889,8 @@ static unsigned si_get_shader_binary_size(struct si_screen *screen, struct si_sh
    return size;
 }
 
-static bool si_get_external_symbol(enum amd_gfx_level gfx_level, void *data, const char *name,
-                                   uint64_t *value)
+bool si_get_external_symbol(enum amd_gfx_level gfx_level, void *data, const char *name,
+                            uint64_t *value)
 {
    uint64_t *scratch_va = data;
 
diff --git a/src/gallium/drivers/radeonsi/si_shader.h b/src/gallium/drivers/radeonsi/si_shader.h
index 4ed1457f774..f8f2fe5519f 100644
--- a/src/gallium/drivers/radeonsi/si_shader.h
+++ b/src/gallium/drivers/radeonsi/si_shader.h
@@ -960,6 +960,8 @@ struct si_shader_part {
 };
 
 /* si_shader.c */
+struct ac_rtld_binary;
+
 void si_update_shader_binary_info(struct si_shader *shader, nir_shader *nir);
 bool si_compile_shader(struct si_screen *sscreen, struct ac_llvm_compiler *compiler,
                        struct si_shader *shader, struct util_debug_callback *debug);
@@ -979,6 +981,10 @@ const char *si_get_shader_name(const struct si_shader *shader);
 void si_shader_binary_clean(struct si_shader_binary *binary);
 struct nir_shader *si_deserialize_shader(struct si_shader_selector *sel);
 unsigned si_get_ps_num_interp(struct si_shader *ps);
+bool si_shader_binary_open(struct si_screen *screen, struct si_shader *shader,
+                           struct ac_rtld_binary *rtld);
+bool si_get_external_symbol(enum amd_gfx_level gfx_level, void *data, const char *name,
+                            uint64_t *value);
 
 /* si_shader_info.c */
 void si_nir_scan_shader(struct si_screen *sscreen,  const struct nir_shader *nir,
diff --git a/src/gallium/drivers/radeonsi/si_sqtt.c b/src/gallium/drivers/radeonsi/si_sqtt.c
index 5e4e03c2bfa..5ce3c710cd6 100644
--- a/src/gallium/drivers/radeonsi/si_sqtt.c
+++ b/src/gallium/drivers/radeonsi/si_sqtt.c
@@ -24,6 +24,7 @@
  */
 
 
+#include "hash_table.h"
 #include "si_pipe.h"
 #include "si_build_pm4.h"
 #include "si_compute.h"
@@ -55,6 +56,8 @@ si_thread_trace_init_bo(struct si_context *sctx)
                   1 << SQTT_BUFFER_ALIGN_SHIFT);
    size += sctx->thread_trace->buffer_size * (uint64_t)max_se;
 
+   sctx->thread_trace->pipeline_bos = _mesa_hash_table_u64_create(NULL);
+
    sctx->thread_trace->bo =
       ws->buffer_create(ws, size, 4096,
                         RADEON_DOMAIN_VRAM,
@@ -697,6 +700,12 @@ si_destroy_thread_trace(struct si_context *sctx)
    }
    simple_mtx_destroy(&sctx->thread_trace->rgp_code_object.lock);
 
+   hash_table_foreach(sctx->thread_trace->pipeline_bos->table, entry) {
+      struct si_sqtt_fake_pipeline *pipeline = (struct si_sqtt_fake_pipeline *)entry->data;
+      si_resource_reference(&pipeline->bo, NULL);
+      FREE(pipeline);
+   }
+
    free(sctx->thread_trace);
    sctx->thread_trace = NULL;
 
@@ -1010,7 +1019,7 @@ si_sqtt_pipe_to_rgp_shader_stage(union si_shader_key* key, enum pipe_shader_type
 
 static bool
 si_sqtt_add_code_object(struct si_context* sctx,
-                        uint64_t pipeline_hash,
+                        struct si_sqtt_fake_pipeline *pipeline,
                         bool is_compute)
 {
    struct ac_thread_trace_data *thread_trace_data = sctx->thread_trace;
@@ -1023,8 +1032,8 @@ si_sqtt_add_code_object(struct si_context* sctx,
 
    record->shader_stages_mask = 0;
    record->num_shaders_combined = 0;
-   record->pipeline_hash[0] = pipeline_hash;
-   record->pipeline_hash[1] = pipeline_hash;
+   record->pipeline_hash[0] = pipeline->code_hash;
+   record->pipeline_hash[1] = pipeline->code_hash;
 
    for (unsigned i = 0; i < PIPE_SHADER_TYPES; i++) {
       struct si_shader *shader;
@@ -1051,7 +1060,7 @@ si_sqtt_add_code_object(struct si_context* sctx,
       }
       memcpy(code, shader->binary.uploaded_code, shader->binary.uploaded_code_size);
 
-      uint64_t va = shader->bo->gpu_address;
+      uint64_t va = pipeline->bo->gpu_address + pipeline->offset[i];
       unsigned gl_shader_stage = tgsi_processor_to_shader_stage(i);
       record->shader_data[gl_shader_stage].hash[0] = _mesa_hash_data(code, shader->binary.uploaded_code_size);
       record->shader_data[gl_shader_stage].hash[1] = record->shader_data[gl_shader_stage].hash[0];
@@ -1079,21 +1088,21 @@ si_sqtt_add_code_object(struct si_context* sctx,
 }
 
 bool
-si_sqtt_register_pipeline(struct si_context* sctx, uint64_t pipeline_hash, uint64_t base_address, bool is_compute)
+si_sqtt_register_pipeline(struct si_context* sctx, struct si_sqtt_fake_pipeline *pipeline, bool is_compute)
 {
    struct ac_thread_trace_data *thread_trace_data = sctx->thread_trace;
 
-   assert (!si_sqtt_pipeline_is_registered(thread_trace_data, pipeline_hash));
+   assert (!si_sqtt_pipeline_is_registered(thread_trace_data, pipeline->code_hash));
 
-   bool result = ac_sqtt_add_pso_correlation(thread_trace_data, pipeline_hash);
+   bool result = ac_sqtt_add_pso_correlation(thread_trace_data, pipeline->code_hash);
    if (!result)
       return false;
 
-   result = ac_sqtt_add_code_object_loader_event(thread_trace_data, pipeline_hash, base_address);
+   result = ac_sqtt_add_code_object_loader_event(thread_trace_data, pipeline->code_hash, pipeline->bo->gpu_address);
    if (!result)
       return false;
 
-   return si_sqtt_add_code_object(sctx, pipeline_hash, is_compute);
+   return si_sqtt_add_code_object(sctx, pipeline, is_compute);
 }
 
 void
diff --git a/src/gallium/drivers/radeonsi/si_state.h b/src/gallium/drivers/radeonsi/si_state.h
index fa60be47933..8e2eb8dd744 100644
--- a/src/gallium/drivers/radeonsi/si_state.h
+++ b/src/gallium/drivers/radeonsi/si_state.h
@@ -187,6 +187,7 @@ union si_state {
       struct si_pm4_state *vgt_shader_config;
       struct si_shader *vs;
       struct si_shader *ps;
+      struct si_sqtt_fake_pipeline *sqtt_pipeline;
    } named;
    struct si_pm4_state *array[sizeof(struct si_state_named) / sizeof(struct si_pm4_state *)];
 };
diff --git a/src/gallium/drivers/radeonsi/si_state_draw.cpp b/src/gallium/drivers/radeonsi/si_state_draw.cpp
index 0a6f2f3bd8f..afc68b4c7fa 100644
--- a/src/gallium/drivers/radeonsi/si_state_draw.cpp
+++ b/src/gallium/drivers/radeonsi/si_state_draw.cpp
@@ -29,6 +29,8 @@
 #include "util/u_index_modify.h"
 #include "util/u_prim.h"
 #include "util/u_upload_mgr.h"
+#include "ac_rtld.h"
+#include "si_build_pm4.h"
 
 #if (GFX_VER == 6)
 #define GFX(name) name##GFX6
@@ -303,28 +305,107 @@ static bool si_update_shaders(struct si_context *sctx)
    }
 
    if (unlikely(sctx->screen->debug_flags & DBG(SQTT) && sctx->thread_trace)) {
-      /* Pretend the bound shaders form a vk pipeline */
-      uint32_t pipeline_code_hash = 0;
-      uint64_t base_address = ~0;
+      /* Pretend the bound shaders form a vk pipeline. Include the scratch size in
+       * the hash calculation to force re-emitting the pipeline if the scratch bo
+       * changes.
+       */
+      uint64_t scratch_bo_size = sctx->scratch_buffer ? sctx->scratch_buffer->bo_size : 0;
+      uint64_t pipeline_code_hash = scratch_bo_size;
+      uint32_t total_size = 0;
 
+      /* Compute pipeline code hash. */
       for (int i = 0; i < SI_NUM_GRAPHICS_SHADERS; i++) {
          struct si_shader *shader = sctx->shaders[i].current;
          if (sctx->shaders[i].cso && shader) {
-            pipeline_code_hash = _mesa_hash_data_with_seed(
+            pipeline_code_hash = XXH64(
                shader->binary.elf_buffer,
                shader->binary.elf_size,
                pipeline_code_hash);
-            base_address = MIN2(base_address,
-                                shader->bo->gpu_address);
+
+            total_size += ALIGN(shader->binary.uploaded_code_size, 256);
          }
       }
 
+      struct si_sqtt_fake_pipeline *pipeline = NULL;
       struct ac_thread_trace_data *thread_trace_data = sctx->thread_trace;
       if (!si_sqtt_pipeline_is_registered(thread_trace_data, pipeline_code_hash)) {
-         si_sqtt_register_pipeline(sctx, pipeline_code_hash, base_address, false);
+         /* This is a new pipeline. Allocate a new bo to hold all the shaders. Without
+          * this, shader code export process creates huge rgp files because RGP assumes
+          * the shaders live sequentially in memory (shader N address = shader 0 + offset N)
+          */
+         struct si_resource *bo = si_aligned_buffer_create(
+            &sctx->screen->b,
+            (sctx->screen->info.cpdma_prefetch_writes_memory ? 0 : SI_RESOURCE_FLAG_READ_ONLY) |
+            SI_RESOURCE_FLAG_DRIVER_INTERNAL | SI_RESOURCE_FLAG_32BIT,
+            PIPE_USAGE_IMMUTABLE, align(total_size, SI_CPDMA_ALIGNMENT), 256);
+
+         char *ptr = (char *) (bo ? sctx->screen->ws->buffer_map(sctx->screen->ws,
+               bo->buf, NULL,
+               (enum pipe_map_flags)(PIPE_MAP_READ_WRITE | PIPE_MAP_UNSYNCHRONIZED | RADEON_MAP_TEMPORARY)) :
+             NULL);
+
+         uint32_t offset = 0;
+         uint64_t scratch_va = sctx->scratch_buffer ? sctx->scratch_buffer->gpu_address : 0;
+
+         if (ptr) {
+            pipeline = (struct si_sqtt_fake_pipeline *)
+               CALLOC(1, sizeof(struct si_sqtt_fake_pipeline));
+            pipeline->code_hash = pipeline_code_hash;
+            si_resource_reference(&pipeline->bo, bo);
+
+            /* Re-upload all gfx shaders and init PM4. */
+            si_pm4_clear_state(&pipeline->pm4);
+
+            for (int i = 0; i < SI_NUM_GRAPHICS_SHADERS; i++) {
+               struct si_shader *shader = sctx->shaders[i].current;
+               if (sctx->shaders[i].cso && shader) {
+                  struct ac_rtld_binary binary;
+                  si_shader_binary_open(sctx->screen, shader, &binary);
+
+                  struct ac_rtld_upload_info u = {};
+                  u.binary = &binary;
+                  u.get_external_symbol = si_get_external_symbol;
+                  u.cb_data = &scratch_va;
+                  u.rx_va = bo->gpu_address + offset;
+                  u.rx_ptr = ptr + offset;
+
+                  int size = ac_rtld_upload(&u);
+                  ac_rtld_close(&binary);
+
+                  pipeline->offset[i] = offset;
+
+                  offset += align(size, 256);
+
+                  struct si_pm4_state *pm4 = &shader->pm4;
+
+                  uint32_t va_low = (pipeline->bo->gpu_address + pipeline->offset[i]) >> 8;
+                  assert(PKT3_IT_OPCODE_G(pm4->pm4[pm4->reg_va_low_idx - 2]) == PKT3_SET_SH_REG);
+                  uint32_t reg = (pm4->pm4[pm4->reg_va_low_idx - 1] << 2) + SI_SH_REG_OFFSET;
+                  si_pm4_set_reg(&pipeline->pm4, reg, va_low);
+               }
+            }
+            sctx->screen->ws->buffer_unmap(sctx->screen->ws, bo->buf);
+
+            _mesa_hash_table_u64_insert(sctx->thread_trace->pipeline_bos,
+                                        pipeline_code_hash, pipeline);
+
+            si_sqtt_register_pipeline(sctx, pipeline, false);
+         } else {
+            if (bo)
+               si_resource_reference(&bo, NULL);
+         }
+      } else {
+         pipeline = (struct si_sqtt_fake_pipeline *)
+            _mesa_hash_table_u64_search(sctx->thread_trace->pipeline_bos, pipeline_code_hash);
       }
+      assert(pipeline);
+
+      pipeline->code_hash = pipeline_code_hash;
+      radeon_add_to_buffer_list(sctx, &sctx->gfx_cs, pipeline->bo,
+                                RADEON_USAGE_READ | RADEON_PRIO_SHADER_BINARY);
 
       si_sqtt_describe_pipeline_bind(sctx, pipeline_code_hash, 0);
+      si_pm4_bind_state(sctx, sqtt_pipeline, pipeline);
    }
 
    if ((GFX_VERSION <= GFX8 &&
diff --git a/src/gallium/drivers/radeonsi/si_state_shaders.cpp b/src/gallium/drivers/radeonsi/si_state_shaders.cpp
index fb301bdc403..c2c09185f8a 100644
--- a/src/gallium/drivers/radeonsi/si_state_shaders.cpp
+++ b/src/gallium/drivers/radeonsi/si_state_shaders.cpp
@@ -695,6 +695,7 @@ static void si_shader_ls(struct si_screen *sscreen, struct si_shader *shader)
 
    va = shader->bo->gpu_address;
    si_pm4_set_reg(pm4, R_00B520_SPI_SHADER_PGM_LO_LS, va >> 8);
+   pm4->reg_va_low_idx = pm4->ndw - 1;
 
    shader->config.rsrc1 = S_00B528_VGPRS((shader->config.num_vgprs - 1) / 4) |
                           S_00B528_SGPRS((shader->config.num_sgprs - 1) / 8) |
@@ -729,6 +730,7 @@ static void si_shader_hs(struct si_screen *sscreen, struct si_shader *shader)
       } else {
          si_pm4_set_reg(pm4, R_00B410_SPI_SHADER_PGM_LO_LS, va >> 8);
       }
+      pm4->reg_va_low_idx = pm4->ndw - 1;
 
       unsigned num_user_sgprs = si_get_num_vs_user_sgprs(shader, GFX9_TCS_NUM_USER_SGPR);
 
@@ -741,6 +743,7 @@ static void si_shader_hs(struct si_screen *sscreen, struct si_shader *shader)
          shader->config.rsrc2 |= S_00B42C_USER_SGPR_MSB_GFX9(num_user_sgprs >> 5);
    } else {
       si_pm4_set_reg(pm4, R_00B420_SPI_SHADER_PGM_LO_HS, va >> 8);
+      pm4->reg_va_low_idx = pm4->ndw - 1;
       si_pm4_set_reg(pm4, R_00B424_SPI_SHADER_PGM_HI_HS,
                      S_00B424_MEM_BASE(sscreen->info.address32_hi >> 8));
 
@@ -816,6 +819,7 @@ static void si_shader_es(struct si_screen *sscreen, struct si_shader *shader)
    oc_lds_en = shader->selector->stage == MESA_SHADER_TESS_EVAL ? 1 : 0;
 
    si_pm4_set_reg(pm4, R_00B320_SPI_SHADER_PGM_LO_ES, va >> 8);
+   pm4->reg_va_low_idx = pm4->ndw - 1;
    si_pm4_set_reg(pm4, R_00B324_SPI_SHADER_PGM_HI_ES,
                   S_00B324_MEM_BASE(sscreen->info.address32_hi >> 8));
    si_pm4_set_reg(pm4, R_00B328_SPI_SHADER_PGM_RSRC1_ES,
@@ -1104,6 +1108,7 @@ static void si_shader_gs(struct si_screen *sscreen, struct si_shader *shader)
       } else {
          si_pm4_set_reg(pm4, R_00B210_SPI_SHADER_PGM_LO_ES, va >> 8);
       }
+      pm4->reg_va_low_idx = pm4->ndw - 1;
 
       uint32_t rsrc1 = S_00B228_VGPRS((shader->config.num_vgprs - 1) / 4) | S_00B228_DX10_CLAMP(1) |
                        S_00B228_MEM_ORDERED(si_shader_mem_ordered(shader)) |
@@ -1149,6 +1154,7 @@ static void si_shader_gs(struct si_screen *sscreen, struct si_shader *shader)
                                                    S_00B21C_WAVE_LIMIT(0x3F);
 
       si_pm4_set_reg(pm4, R_00B220_SPI_SHADER_PGM_LO_GS, va >> 8);
+      pm4->reg_va_low_idx = pm4->ndw - 1;
       si_pm4_set_reg(pm4, R_00B224_SPI_SHADER_PGM_HI_GS,
                      S_00B224_MEM_BASE(sscreen->info.address32_hi >> 8));
 
@@ -1421,6 +1427,7 @@ static void gfx10_shader_ngg(struct si_screen *sscreen, struct si_shader *shader
                          &late_alloc_wave64, &cu_mask);
 
    si_pm4_set_reg(pm4, R_00B320_SPI_SHADER_PGM_LO_ES, va >> 8);
+   pm4->reg_va_low_idx = pm4->ndw - 1;
    si_pm4_set_reg(
       pm4, R_00B228_SPI_SHADER_PGM_RSRC1_GS,
       S_00B228_VGPRS((shader->config.num_vgprs - 1) / (shader->wave_size == 32 ? 8 : 4)) |
@@ -1733,6 +1740,7 @@ static void si_shader_vs(struct si_screen *sscreen, struct si_shader *shader,
    }
 
    si_pm4_set_reg(pm4, R_00B120_SPI_SHADER_PGM_LO_VS, va >> 8);
+   pm4->reg_va_low_idx = pm4->ndw - 1;
    si_pm4_set_reg(pm4, R_00B124_SPI_SHADER_PGM_HI_VS,
                   S_00B124_MEM_BASE(sscreen->info.address32_hi >> 8));
 
@@ -2008,6 +2016,7 @@ static void si_shader_ps(struct si_screen *sscreen, struct si_shader *shader)
 
    va = shader->bo->gpu_address;
    si_pm4_set_reg(pm4, R_00B020_SPI_SHADER_PGM_LO_PS, va >> 8);
+   pm4->reg_va_low_idx = pm4->ndw - 1;
    si_pm4_set_reg(pm4, R_00B024_SPI_SHADER_PGM_HI_PS,
                   S_00B024_MEM_BASE(sscreen->info.address32_hi >> 8));
 
@@ -2078,6 +2087,9 @@ static void si_shader_init_pm4_state(struct si_screen *sscreen, struct si_shader
    default:
       assert(0);
    }
+
+   if (unlikely(sscreen->debug_flags & DBG(SQTT)))
+      assert(shader->pm4.reg_va_low_idx != 0);
 }
 
 static void si_clear_vs_key_inputs(struct si_context *sctx, union si_shader_key *key,