radeonsi/sqtt: re-export shaders in a single bo

RGP expects all of a pipeline's shaders to be stored sequentially, e.g.:

  [vs][ps][gs]

As such, it assumes a single bo is dumped to the .rgp file, with
the following info (pictured in the sketch below):
  * va of the bo
  * offset of each shader inside the bo
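For illustration, the assumed layout can be pictured as a struct
(hypothetical names, not an actual RGP type):

  /* Hypothetical sketch of the per-pipeline info RGP expects: */
  struct rgp_pipeline_layout {
     uint64_t base_va;    /* va of the single bo holding all shaders */
     uint32_t offset[3];  /* per-stage offsets inside the bo, e.g. vs/ps/gs */
  };
  /* Shader N's code must be readable at base_va + offset[N]. */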

For radeonsi, the shaders are stored individually, so there can be
a big gap between the shaders forming a pipeline => we can produce
very large files, because the layout in the file must match the one
in memory (see the warning in ac_rgp_file_write_elf_text).
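
As a made-up illustration of the blow-up (all addresses and sizes
below are invented):

  uint64_t vs_va   = 0x100000000;   /* vs shader bo */
  uint64_t ps_va   = 0x2500000000;  /* ps shader bo, allocated far away */
  uint64_t ps_size = 0x2000;
  /* The dumped .text must cover the whole span, gaps included: */
  uint64_t dumped  = ps_va + ps_size - vs_va;  /* ~150 GB of mostly padding */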

This commit implements a workaround: gfx shaders are re-uploaded
back-to-back into a single bo and re-exported as a pipeline.

To update the shader addresses, a new state (sqtt_pipeline) is
created, which overwrites the needed SPI_SHADER_PGM_LO_* registers.
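
Concretely, the override re-emits the low address dword against the
shader's copy in the shared bo, along the lines of (a simplified
extract of the change below):

  /* For each gfx shader i re-uploaded into the shared bo: */
  uint32_t va_low = (pipeline->bo->gpu_address + pipeline->offset[i]) >> 8;
  si_pm4_set_reg(&pipeline->pm4, reg, va_low); /* reg = the stage's *_PGM_LO_* */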

This reduces Deus Ex rgp captures from 150GB+ to less than 100MB.

Reviewed-by: Marek Olšák <marek.olsak@amd.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/18865>
Author:    Pierre-Eric Pelloux-Prayer
Date:      2022-09-26 14:34:03 +02:00
Committed: Marge Bot
parent 5a9a43c8f9
commit 8034a71430
10 changed files with 151 additions and 23 deletions

@@ -53,6 +53,8 @@ struct ac_thread_trace_data {
struct rgp_queue_event rgp_queue_event;
struct rgp_clock_calibration rgp_clock_calibration;
struct hash_table_u64 *pipeline_bos;
};
#define SQTT_BUFFER_ALIGN_SHIFT 12

@@ -320,11 +320,18 @@ static void si_bind_compute_state(struct pipe_context *ctx, void *state)
program->shader.binary.elf_buffer,
program->shader.binary.elf_size,
0);
uint64_t base_address = program->shader.bo->gpu_address;
struct ac_thread_trace_data *thread_trace_data = sctx->thread_trace;
if (!si_sqtt_pipeline_is_registered(thread_trace_data, pipeline_code_hash)) {
si_sqtt_register_pipeline(sctx, pipeline_code_hash, base_address, true);
/* Short-lived fake pipeline: we don't need to re-upload the compute shaders
 * as we do for the gfx ones, so just create a temp pipeline to be able to
 * call si_sqtt_register_pipeline, and then drop it.
 */
struct si_sqtt_fake_pipeline pipeline = { 0 };
pipeline.code_hash = pipeline_code_hash;
pipeline.bo = program->shader.bo;
si_sqtt_register_pipeline(sctx, &pipeline, true);
}
si_sqtt_describe_pipeline_bind(sctx, pipeline_code_hash, 1);

@@ -912,6 +912,13 @@ struct si_saved_cs {
int64_t time_flush;
};
struct si_sqtt_fake_pipeline {
struct si_pm4_state pm4; /* base class */
uint64_t code_hash;
struct si_resource *bo;
uint32_t offset[SI_NUM_GRAPHICS_SHADERS];
};
struct si_small_prim_cull_info {
float scale[2], translate[2];
float scale_no_aa[2], translate_no_aa[2];
@@ -1646,7 +1653,7 @@ void si_sqtt_write_event_marker(struct si_context* sctx, struct radeon_cmdbuf *r
uint32_t vertex_offset_user_data,
uint32_t instance_offset_user_data,
uint32_t draw_index_user_data);
bool si_sqtt_register_pipeline(struct si_context* sctx, uint64_t pipeline_hash, uint64_t base_address, bool is_compute);
bool si_sqtt_register_pipeline(struct si_context* sctx, struct si_sqtt_fake_pipeline *pipeline, bool is_compute);
bool si_sqtt_pipeline_is_registered(struct ac_thread_trace_data *thread_trace_data,
uint64_t pipeline_hash);
void si_sqtt_describe_pipeline_bind(struct si_context* sctx, uint64_t pipeline_hash, int bind_point);

@@ -55,6 +55,9 @@ struct si_pm4_state {
/* commands for the DE */
uint16_t max_dw;
/* Used by SQTT to override the shader address */
uint16_t reg_va_low_idx;
/* This must be the last field because the array can continue after the structure. */
uint32_t pm4[64];
};
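
For reference, reg_va_low_idx points at the value dword of the SET_SH_REG
packet that si_pm4_set_reg emitted for the *_PGM_LO_* register, which is why
the code further below can recover the packet header and register offset from
the two preceding dwords (a sketch of the dword layout, not the exact Mesa
macros):

  /* pm4[reg_va_low_idx - 2]  PKT3 header, opcode PKT3_SET_SH_REG
   * pm4[reg_va_low_idx - 1]  (reg - SI_SH_REG_OFFSET) >> 2, register offset
   * pm4[reg_va_low_idx    ]  va >> 8, the dword sqtt_pipeline overrides
   */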

@@ -817,8 +817,8 @@ static unsigned get_lds_granularity(struct si_screen *screen, gl_shader_stage st
screen->info.gfx_level >= GFX7 ? 512 : 256;
}
static bool si_shader_binary_open(struct si_screen *screen, struct si_shader *shader,
struct ac_rtld_binary *rtld)
bool si_shader_binary_open(struct si_screen *screen, struct si_shader *shader,
struct ac_rtld_binary *rtld)
{
const struct si_shader_selector *sel = shader->selector;
const char *part_elfs[5];
@@ -889,8 +889,8 @@ static unsigned si_get_shader_binary_size(struct si_screen *screen, struct si_sh
return size;
}
static bool si_get_external_symbol(enum amd_gfx_level gfx_level, void *data, const char *name,
uint64_t *value)
bool si_get_external_symbol(enum amd_gfx_level gfx_level, void *data, const char *name,
uint64_t *value)
{
uint64_t *scratch_va = data;

@@ -960,6 +960,8 @@ struct si_shader_part {
};
/* si_shader.c */
struct ac_rtld_binary;
void si_update_shader_binary_info(struct si_shader *shader, nir_shader *nir);
bool si_compile_shader(struct si_screen *sscreen, struct ac_llvm_compiler *compiler,
struct si_shader *shader, struct util_debug_callback *debug);
@@ -979,6 +981,10 @@ const char *si_get_shader_name(const struct si_shader *shader);
void si_shader_binary_clean(struct si_shader_binary *binary);
struct nir_shader *si_deserialize_shader(struct si_shader_selector *sel);
unsigned si_get_ps_num_interp(struct si_shader *ps);
bool si_shader_binary_open(struct si_screen *screen, struct si_shader *shader,
struct ac_rtld_binary *rtld);
bool si_get_external_symbol(enum amd_gfx_level gfx_level, void *data, const char *name,
uint64_t *value);
/* si_shader_info.c */
void si_nir_scan_shader(struct si_screen *sscreen, const struct nir_shader *nir,

@@ -24,6 +24,7 @@
*/
#include "hash_table.h"
#include "si_pipe.h"
#include "si_build_pm4.h"
#include "si_compute.h"
@@ -55,6 +56,8 @@ si_thread_trace_init_bo(struct si_context *sctx)
1 << SQTT_BUFFER_ALIGN_SHIFT);
size += sctx->thread_trace->buffer_size * (uint64_t)max_se;
sctx->thread_trace->pipeline_bos = _mesa_hash_table_u64_create(NULL);
sctx->thread_trace->bo =
ws->buffer_create(ws, size, 4096,
RADEON_DOMAIN_VRAM,
@@ -697,6 +700,12 @@ si_destroy_thread_trace(struct si_context *sctx)
}
simple_mtx_destroy(&sctx->thread_trace->rgp_code_object.lock);
hash_table_foreach(sctx->thread_trace->pipeline_bos->table, entry) {
struct si_sqtt_fake_pipeline *pipeline = (struct si_sqtt_fake_pipeline *)entry->data;
si_resource_reference(&pipeline->bo, NULL);
FREE(pipeline);
}
free(sctx->thread_trace);
sctx->thread_trace = NULL;
@@ -1010,7 +1019,7 @@ si_sqtt_pipe_to_rgp_shader_stage(union si_shader_key* key, enum pipe_shader_type
static bool
si_sqtt_add_code_object(struct si_context* sctx,
uint64_t pipeline_hash,
struct si_sqtt_fake_pipeline *pipeline,
bool is_compute)
{
struct ac_thread_trace_data *thread_trace_data = sctx->thread_trace;
@@ -1023,8 +1032,8 @@ si_sqtt_add_code_object(struct si_context* sctx,
record->shader_stages_mask = 0;
record->num_shaders_combined = 0;
record->pipeline_hash[0] = pipeline_hash;
record->pipeline_hash[1] = pipeline_hash;
record->pipeline_hash[0] = pipeline->code_hash;
record->pipeline_hash[1] = pipeline->code_hash;
for (unsigned i = 0; i < PIPE_SHADER_TYPES; i++) {
struct si_shader *shader;
@@ -1051,7 +1060,7 @@ si_sqtt_add_code_object(struct si_context* sctx,
}
memcpy(code, shader->binary.uploaded_code, shader->binary.uploaded_code_size);
uint64_t va = shader->bo->gpu_address;
uint64_t va = pipeline->bo->gpu_address + pipeline->offset[i];
unsigned gl_shader_stage = tgsi_processor_to_shader_stage(i);
record->shader_data[gl_shader_stage].hash[0] = _mesa_hash_data(code, shader->binary.uploaded_code_size);
record->shader_data[gl_shader_stage].hash[1] = record->shader_data[gl_shader_stage].hash[0];
@@ -1079,21 +1088,21 @@ si_sqtt_add_code_object(struct si_context* sctx,
}
bool
si_sqtt_register_pipeline(struct si_context* sctx, uint64_t pipeline_hash, uint64_t base_address, bool is_compute)
si_sqtt_register_pipeline(struct si_context* sctx, struct si_sqtt_fake_pipeline *pipeline, bool is_compute)
{
struct ac_thread_trace_data *thread_trace_data = sctx->thread_trace;
assert (!si_sqtt_pipeline_is_registered(thread_trace_data, pipeline_hash));
assert (!si_sqtt_pipeline_is_registered(thread_trace_data, pipeline->code_hash));
bool result = ac_sqtt_add_pso_correlation(thread_trace_data, pipeline_hash);
bool result = ac_sqtt_add_pso_correlation(thread_trace_data, pipeline->code_hash);
if (!result)
return false;
result = ac_sqtt_add_code_object_loader_event(thread_trace_data, pipeline_hash, base_address);
result = ac_sqtt_add_code_object_loader_event(thread_trace_data, pipeline->code_hash, pipeline->bo->gpu_address);
if (!result)
return false;
return si_sqtt_add_code_object(sctx, pipeline_hash, is_compute);
return si_sqtt_add_code_object(sctx, pipeline, is_compute);
}
void

@@ -187,6 +187,7 @@ union si_state {
struct si_pm4_state *vgt_shader_config;
struct si_shader *vs;
struct si_shader *ps;
struct si_sqtt_fake_pipeline *sqtt_pipeline;
} named;
struct si_pm4_state *array[sizeof(struct si_state_named) / sizeof(struct si_pm4_state *)];
};

@@ -29,6 +29,8 @@
#include "util/u_index_modify.h"
#include "util/u_prim.h"
#include "util/u_upload_mgr.h"
#include "ac_rtld.h"
#include "si_build_pm4.h"
#if (GFX_VER == 6)
#define GFX(name) name##GFX6
@@ -303,28 +305,107 @@ static bool si_update_shaders(struct si_context *sctx)
}
if (unlikely(sctx->screen->debug_flags & DBG(SQTT) && sctx->thread_trace)) {
/* Pretend the bound shaders form a vk pipeline */
uint32_t pipeline_code_hash = 0;
uint64_t base_address = ~0;
/* Pretend the bound shaders form a vk pipeline. Include the scratch size in
* the hash calculation to force re-emitting the pipeline if the scratch bo
* changes.
*/
uint64_t scratch_bo_size = sctx->scratch_buffer ? sctx->scratch_buffer->bo_size : 0;
uint64_t pipeline_code_hash = scratch_bo_size;
uint32_t total_size = 0;
/* Compute pipeline code hash. */
for (int i = 0; i < SI_NUM_GRAPHICS_SHADERS; i++) {
struct si_shader *shader = sctx->shaders[i].current;
if (sctx->shaders[i].cso && shader) {
pipeline_code_hash = _mesa_hash_data_with_seed(
pipeline_code_hash = XXH64(
shader->binary.elf_buffer,
shader->binary.elf_size,
pipeline_code_hash);
base_address = MIN2(base_address,
shader->bo->gpu_address);
total_size += ALIGN(shader->binary.uploaded_code_size, 256);
}
}
struct si_sqtt_fake_pipeline *pipeline = NULL;
struct ac_thread_trace_data *thread_trace_data = sctx->thread_trace;
if (!si_sqtt_pipeline_is_registered(thread_trace_data, pipeline_code_hash)) {
si_sqtt_register_pipeline(sctx, pipeline_code_hash, base_address, false);
/* This is a new pipeline. Allocate a new bo to hold all the shaders. Without
 * this, the shader code export process creates huge rgp files because RGP
 * assumes the shaders live sequentially in memory (shader N address =
 * shader 0 address + offset N).
 */
struct si_resource *bo = si_aligned_buffer_create(
&sctx->screen->b,
(sctx->screen->info.cpdma_prefetch_writes_memory ? 0 : SI_RESOURCE_FLAG_READ_ONLY) |
SI_RESOURCE_FLAG_DRIVER_INTERNAL | SI_RESOURCE_FLAG_32BIT,
PIPE_USAGE_IMMUTABLE, align(total_size, SI_CPDMA_ALIGNMENT), 256);
char *ptr = (char *) (bo ? sctx->screen->ws->buffer_map(sctx->screen->ws,
bo->buf, NULL,
(enum pipe_map_flags)(PIPE_MAP_READ_WRITE | PIPE_MAP_UNSYNCHRONIZED | RADEON_MAP_TEMPORARY)) :
NULL);
uint32_t offset = 0;
uint64_t scratch_va = sctx->scratch_buffer ? sctx->scratch_buffer->gpu_address : 0;
if (ptr) {
pipeline = (struct si_sqtt_fake_pipeline *)
CALLOC(1, sizeof(struct si_sqtt_fake_pipeline));
pipeline->code_hash = pipeline_code_hash;
si_resource_reference(&pipeline->bo, bo);
/* Re-upload all gfx shaders and init PM4. */
si_pm4_clear_state(&pipeline->pm4);
for (int i = 0; i < SI_NUM_GRAPHICS_SHADERS; i++) {
struct si_shader *shader = sctx->shaders[i].current;
if (sctx->shaders[i].cso && shader) {
struct ac_rtld_binary binary;
si_shader_binary_open(sctx->screen, shader, &binary);
struct ac_rtld_upload_info u = {};
u.binary = &binary;
u.get_external_symbol = si_get_external_symbol;
u.cb_data = &scratch_va;
u.rx_va = bo->gpu_address + offset;
u.rx_ptr = ptr + offset;
int size = ac_rtld_upload(&u);
ac_rtld_close(&binary);
pipeline->offset[i] = offset;
offset += align(size, 256);
struct si_pm4_state *pm4 = &shader->pm4;
uint32_t va_low = (pipeline->bo->gpu_address + pipeline->offset[i]) >> 8;
assert(PKT3_IT_OPCODE_G(pm4->pm4[pm4->reg_va_low_idx - 2]) == PKT3_SET_SH_REG);
uint32_t reg = (pm4->pm4[pm4->reg_va_low_idx - 1] << 2) + SI_SH_REG_OFFSET;
si_pm4_set_reg(&pipeline->pm4, reg, va_low);
}
}
sctx->screen->ws->buffer_unmap(sctx->screen->ws, bo->buf);
_mesa_hash_table_u64_insert(sctx->thread_trace->pipeline_bos,
pipeline_code_hash, pipeline);
si_sqtt_register_pipeline(sctx, pipeline, false);
} else {
if (bo)
si_resource_reference(&bo, NULL);
}
} else {
pipeline = (struct si_sqtt_fake_pipeline *)
_mesa_hash_table_u64_search(sctx->thread_trace->pipeline_bos, pipeline_code_hash);
}
assert(pipeline);
pipeline->code_hash = pipeline_code_hash;
radeon_add_to_buffer_list(sctx, &sctx->gfx_cs, pipeline->bo,
RADEON_USAGE_READ | RADEON_PRIO_SHADER_BINARY);
si_sqtt_describe_pipeline_bind(sctx, pipeline_code_hash, 0);
si_pm4_bind_state(sctx, sqtt_pipeline, pipeline);
}
if ((GFX_VERSION <= GFX8 &&

@@ -695,6 +695,7 @@ static void si_shader_ls(struct si_screen *sscreen, struct si_shader *shader)
va = shader->bo->gpu_address;
si_pm4_set_reg(pm4, R_00B520_SPI_SHADER_PGM_LO_LS, va >> 8);
pm4->reg_va_low_idx = pm4->ndw - 1;
shader->config.rsrc1 = S_00B528_VGPRS((shader->config.num_vgprs - 1) / 4) |
S_00B528_SGPRS((shader->config.num_sgprs - 1) / 8) |
@@ -729,6 +730,7 @@ static void si_shader_hs(struct si_screen *sscreen, struct si_shader *shader)
} else {
si_pm4_set_reg(pm4, R_00B410_SPI_SHADER_PGM_LO_LS, va >> 8);
}
pm4->reg_va_low_idx = pm4->ndw - 1;
unsigned num_user_sgprs = si_get_num_vs_user_sgprs(shader, GFX9_TCS_NUM_USER_SGPR);
@@ -741,6 +743,7 @@ static void si_shader_hs(struct si_screen *sscreen, struct si_shader *shader)
shader->config.rsrc2 |= S_00B42C_USER_SGPR_MSB_GFX9(num_user_sgprs >> 5);
} else {
si_pm4_set_reg(pm4, R_00B420_SPI_SHADER_PGM_LO_HS, va >> 8);
pm4->reg_va_low_idx = pm4->ndw - 1;
si_pm4_set_reg(pm4, R_00B424_SPI_SHADER_PGM_HI_HS,
S_00B424_MEM_BASE(sscreen->info.address32_hi >> 8));
@@ -816,6 +819,7 @@ static void si_shader_es(struct si_screen *sscreen, struct si_shader *shader)
oc_lds_en = shader->selector->stage == MESA_SHADER_TESS_EVAL ? 1 : 0;
si_pm4_set_reg(pm4, R_00B320_SPI_SHADER_PGM_LO_ES, va >> 8);
pm4->reg_va_low_idx = pm4->ndw - 1;
si_pm4_set_reg(pm4, R_00B324_SPI_SHADER_PGM_HI_ES,
S_00B324_MEM_BASE(sscreen->info.address32_hi >> 8));
si_pm4_set_reg(pm4, R_00B328_SPI_SHADER_PGM_RSRC1_ES,
@@ -1104,6 +1108,7 @@ static void si_shader_gs(struct si_screen *sscreen, struct si_shader *shader)
} else {
si_pm4_set_reg(pm4, R_00B210_SPI_SHADER_PGM_LO_ES, va >> 8);
}
pm4->reg_va_low_idx = pm4->ndw - 1;
uint32_t rsrc1 = S_00B228_VGPRS((shader->config.num_vgprs - 1) / 4) | S_00B228_DX10_CLAMP(1) |
S_00B228_MEM_ORDERED(si_shader_mem_ordered(shader)) |
@@ -1149,6 +1154,7 @@ static void si_shader_gs(struct si_screen *sscreen, struct si_shader *shader)
S_00B21C_WAVE_LIMIT(0x3F);
si_pm4_set_reg(pm4, R_00B220_SPI_SHADER_PGM_LO_GS, va >> 8);
pm4->reg_va_low_idx = pm4->ndw - 1;
si_pm4_set_reg(pm4, R_00B224_SPI_SHADER_PGM_HI_GS,
S_00B224_MEM_BASE(sscreen->info.address32_hi >> 8));
@@ -1421,6 +1427,7 @@ static void gfx10_shader_ngg(struct si_screen *sscreen, struct si_shader *shader
&late_alloc_wave64, &cu_mask);
si_pm4_set_reg(pm4, R_00B320_SPI_SHADER_PGM_LO_ES, va >> 8);
pm4->reg_va_low_idx = pm4->ndw - 1;
si_pm4_set_reg(
pm4, R_00B228_SPI_SHADER_PGM_RSRC1_GS,
S_00B228_VGPRS((shader->config.num_vgprs - 1) / (shader->wave_size == 32 ? 8 : 4)) |
@@ -1733,6 +1740,7 @@ static void si_shader_vs(struct si_screen *sscreen, struct si_shader *shader,
}
si_pm4_set_reg(pm4, R_00B120_SPI_SHADER_PGM_LO_VS, va >> 8);
pm4->reg_va_low_idx = pm4->ndw - 1;
si_pm4_set_reg(pm4, R_00B124_SPI_SHADER_PGM_HI_VS,
S_00B124_MEM_BASE(sscreen->info.address32_hi >> 8));
@@ -2008,6 +2016,7 @@ static void si_shader_ps(struct si_screen *sscreen, struct si_shader *shader)
va = shader->bo->gpu_address;
si_pm4_set_reg(pm4, R_00B020_SPI_SHADER_PGM_LO_PS, va >> 8);
pm4->reg_va_low_idx = pm4->ndw - 1;
si_pm4_set_reg(pm4, R_00B024_SPI_SHADER_PGM_HI_PS,
S_00B024_MEM_BASE(sscreen->info.address32_hi >> 8));
@@ -2078,6 +2087,9 @@ static void si_shader_init_pm4_state(struct si_screen *sscreen, struct si_shader
default:
assert(0);
}
if (unlikely(sscreen->debug_flags & DBG(SQTT)))
assert(shader->pm4.reg_va_low_idx != 0);
}
static void si_clear_vs_key_inputs(struct si_context *sctx, union si_shader_key *key,