From 8034a71430be0b6473449028d90937729b77d6d9 Mon Sep 17 00:00:00 2001 From: Pierre-Eric Pelloux-Prayer Date: Mon, 26 Sep 2022 14:34:03 +0200 Subject: [PATCH] radeonsi/sqtt: re-export shaders in a single bo MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit RGP expects a pipeline's shaders to all be stored sequentially, e.g.: [vs][ps][gs] As such, it assumes a single bo is dumped to the .rgp file, with the following info: * va of the bo * offset to each shader inside the bo For radeonsi, the shaders are stored individually, so we may have a big gap between the shaders forming a pipeline => we can produce very large files because the layout in the file must match the one in memory (see the warning in ac_rgp_file_write_elf_text). This commit implements a workaround: gfx shaders are re-exported as a pipeline. To update the shader address, a new state is created (sqtt_pipeline), which will overwrite the needed _PGM_LO_* registers. This reduces Deus Ex RGP captures from 150GB+ to less than 100MB. 
Reviewed-by: Marek Olšák Part-of: --- src/amd/common/ac_sqtt.h | 2 + src/gallium/drivers/radeonsi/si_compute.c | 11 ++- src/gallium/drivers/radeonsi/si_pipe.h | 9 +- src/gallium/drivers/radeonsi/si_pm4.h | 3 + src/gallium/drivers/radeonsi/si_shader.c | 8 +- src/gallium/drivers/radeonsi/si_shader.h | 6 ++ src/gallium/drivers/radeonsi/si_sqtt.c | 27 ++++-- src/gallium/drivers/radeonsi/si_state.h | 1 + .../drivers/radeonsi/si_state_draw.cpp | 95 +++++++++++++++++-- .../drivers/radeonsi/si_state_shaders.cpp | 12 +++ 10 files changed, 151 insertions(+), 23 deletions(-) diff --git a/src/amd/common/ac_sqtt.h b/src/amd/common/ac_sqtt.h index 6c1709a5730..b9aadbf0735 100644 --- a/src/amd/common/ac_sqtt.h +++ b/src/amd/common/ac_sqtt.h @@ -53,6 +53,8 @@ struct ac_thread_trace_data { struct rgp_queue_event rgp_queue_event; struct rgp_clock_calibration rgp_clock_calibration; + + struct hash_table_u64 *pipeline_bos; }; #define SQTT_BUFFER_ALIGN_SHIFT 12 diff --git a/src/gallium/drivers/radeonsi/si_compute.c b/src/gallium/drivers/radeonsi/si_compute.c index 3e09dda62bf..f59cf3aed81 100644 --- a/src/gallium/drivers/radeonsi/si_compute.c +++ b/src/gallium/drivers/radeonsi/si_compute.c @@ -320,11 +320,18 @@ static void si_bind_compute_state(struct pipe_context *ctx, void *state) program->shader.binary.elf_buffer, program->shader.binary.elf_size, 0); - uint64_t base_address = program->shader.bo->gpu_address; struct ac_thread_trace_data *thread_trace_data = sctx->thread_trace; if (!si_sqtt_pipeline_is_registered(thread_trace_data, pipeline_code_hash)) { - si_sqtt_register_pipeline(sctx, pipeline_code_hash, base_address, true); + /* Short lived fake pipeline: we don't need to reupload the compute shaders, + * as we do for the gfx ones so just create a temp pipeline to be able to + * call si_sqtt_register_pipeline, and then drop it. 
+ */ + struct si_sqtt_fake_pipeline pipeline = { 0 }; + pipeline.code_hash = pipeline_code_hash; + pipeline.bo = program->shader.bo; + + si_sqtt_register_pipeline(sctx, &pipeline, true); } si_sqtt_describe_pipeline_bind(sctx, pipeline_code_hash, 1); diff --git a/src/gallium/drivers/radeonsi/si_pipe.h b/src/gallium/drivers/radeonsi/si_pipe.h index 811f34075db..95e252187b4 100644 --- a/src/gallium/drivers/radeonsi/si_pipe.h +++ b/src/gallium/drivers/radeonsi/si_pipe.h @@ -912,6 +912,13 @@ struct si_saved_cs { int64_t time_flush; }; +struct si_sqtt_fake_pipeline { + struct si_pm4_state pm4; /* base class */ + uint64_t code_hash; + struct si_resource *bo; + uint32_t offset[SI_NUM_GRAPHICS_SHADERS]; +}; + struct si_small_prim_cull_info { float scale[2], translate[2]; float scale_no_aa[2], translate_no_aa[2]; @@ -1646,7 +1653,7 @@ void si_sqtt_write_event_marker(struct si_context* sctx, struct radeon_cmdbuf *r uint32_t vertex_offset_user_data, uint32_t instance_offset_user_data, uint32_t draw_index_user_data); -bool si_sqtt_register_pipeline(struct si_context* sctx, uint64_t pipeline_hash, uint64_t base_address, bool is_compute); +bool si_sqtt_register_pipeline(struct si_context* sctx, struct si_sqtt_fake_pipeline *pipeline, bool is_compute); bool si_sqtt_pipeline_is_registered(struct ac_thread_trace_data *thread_trace_data, uint64_t pipeline_hash); void si_sqtt_describe_pipeline_bind(struct si_context* sctx, uint64_t pipeline_hash, int bind_point); diff --git a/src/gallium/drivers/radeonsi/si_pm4.h b/src/gallium/drivers/radeonsi/si_pm4.h index 1b657e75d89..4d1770a96d8 100644 --- a/src/gallium/drivers/radeonsi/si_pm4.h +++ b/src/gallium/drivers/radeonsi/si_pm4.h @@ -55,6 +55,9 @@ struct si_pm4_state { /* commands for the DE */ uint16_t max_dw; + /* Used by SQTT to override the shader address */ + uint16_t reg_va_low_idx; + /* This must be the last field because the array can continue after the structure. 
*/ uint32_t pm4[64]; }; diff --git a/src/gallium/drivers/radeonsi/si_shader.c b/src/gallium/drivers/radeonsi/si_shader.c index a9a534548ff..c9eb13927a7 100644 --- a/src/gallium/drivers/radeonsi/si_shader.c +++ b/src/gallium/drivers/radeonsi/si_shader.c @@ -817,8 +817,8 @@ static unsigned get_lds_granularity(struct si_screen *screen, gl_shader_stage st screen->info.gfx_level >= GFX7 ? 512 : 256; } -static bool si_shader_binary_open(struct si_screen *screen, struct si_shader *shader, - struct ac_rtld_binary *rtld) +bool si_shader_binary_open(struct si_screen *screen, struct si_shader *shader, + struct ac_rtld_binary *rtld) { const struct si_shader_selector *sel = shader->selector; const char *part_elfs[5]; @@ -889,8 +889,8 @@ static unsigned si_get_shader_binary_size(struct si_screen *screen, struct si_sh return size; } -static bool si_get_external_symbol(enum amd_gfx_level gfx_level, void *data, const char *name, - uint64_t *value) +bool si_get_external_symbol(enum amd_gfx_level gfx_level, void *data, const char *name, + uint64_t *value) { uint64_t *scratch_va = data; diff --git a/src/gallium/drivers/radeonsi/si_shader.h b/src/gallium/drivers/radeonsi/si_shader.h index 4ed1457f774..f8f2fe5519f 100644 --- a/src/gallium/drivers/radeonsi/si_shader.h +++ b/src/gallium/drivers/radeonsi/si_shader.h @@ -960,6 +960,8 @@ struct si_shader_part { }; /* si_shader.c */ +struct ac_rtld_binary; + void si_update_shader_binary_info(struct si_shader *shader, nir_shader *nir); bool si_compile_shader(struct si_screen *sscreen, struct ac_llvm_compiler *compiler, struct si_shader *shader, struct util_debug_callback *debug); @@ -979,6 +981,10 @@ const char *si_get_shader_name(const struct si_shader *shader); void si_shader_binary_clean(struct si_shader_binary *binary); struct nir_shader *si_deserialize_shader(struct si_shader_selector *sel); unsigned si_get_ps_num_interp(struct si_shader *ps); +bool si_shader_binary_open(struct si_screen *screen, struct si_shader *shader, + struct 
ac_rtld_binary *rtld); +bool si_get_external_symbol(enum amd_gfx_level gfx_level, void *data, const char *name, + uint64_t *value); /* si_shader_info.c */ void si_nir_scan_shader(struct si_screen *sscreen, const struct nir_shader *nir, diff --git a/src/gallium/drivers/radeonsi/si_sqtt.c b/src/gallium/drivers/radeonsi/si_sqtt.c index 5e4e03c2bfa..5ce3c710cd6 100644 --- a/src/gallium/drivers/radeonsi/si_sqtt.c +++ b/src/gallium/drivers/radeonsi/si_sqtt.c @@ -24,6 +24,7 @@ */ +#include "hash_table.h" #include "si_pipe.h" #include "si_build_pm4.h" #include "si_compute.h" @@ -55,6 +56,8 @@ si_thread_trace_init_bo(struct si_context *sctx) 1 << SQTT_BUFFER_ALIGN_SHIFT); size += sctx->thread_trace->buffer_size * (uint64_t)max_se; + sctx->thread_trace->pipeline_bos = _mesa_hash_table_u64_create(NULL); + sctx->thread_trace->bo = ws->buffer_create(ws, size, 4096, RADEON_DOMAIN_VRAM, @@ -697,6 +700,12 @@ si_destroy_thread_trace(struct si_context *sctx) } simple_mtx_destroy(&sctx->thread_trace->rgp_code_object.lock); + hash_table_foreach(sctx->thread_trace->pipeline_bos->table, entry) { + struct si_sqtt_fake_pipeline *pipeline = (struct si_sqtt_fake_pipeline *)entry->data; + si_resource_reference(&pipeline->bo, NULL); + FREE(pipeline); + } + free(sctx->thread_trace); sctx->thread_trace = NULL; @@ -1010,7 +1019,7 @@ si_sqtt_pipe_to_rgp_shader_stage(union si_shader_key* key, enum pipe_shader_type static bool si_sqtt_add_code_object(struct si_context* sctx, - uint64_t pipeline_hash, + struct si_sqtt_fake_pipeline *pipeline, bool is_compute) { struct ac_thread_trace_data *thread_trace_data = sctx->thread_trace; @@ -1023,8 +1032,8 @@ si_sqtt_add_code_object(struct si_context* sctx, record->shader_stages_mask = 0; record->num_shaders_combined = 0; - record->pipeline_hash[0] = pipeline_hash; - record->pipeline_hash[1] = pipeline_hash; + record->pipeline_hash[0] = pipeline->code_hash; + record->pipeline_hash[1] = pipeline->code_hash; for (unsigned i = 0; i < PIPE_SHADER_TYPES; i++) { 
struct si_shader *shader; @@ -1051,7 +1060,7 @@ si_sqtt_add_code_object(struct si_context* sctx, } memcpy(code, shader->binary.uploaded_code, shader->binary.uploaded_code_size); - uint64_t va = shader->bo->gpu_address; + uint64_t va = pipeline->bo->gpu_address + pipeline->offset[i]; unsigned gl_shader_stage = tgsi_processor_to_shader_stage(i); record->shader_data[gl_shader_stage].hash[0] = _mesa_hash_data(code, shader->binary.uploaded_code_size); record->shader_data[gl_shader_stage].hash[1] = record->shader_data[gl_shader_stage].hash[0]; @@ -1079,21 +1088,21 @@ si_sqtt_add_code_object(struct si_context* sctx, } bool -si_sqtt_register_pipeline(struct si_context* sctx, uint64_t pipeline_hash, uint64_t base_address, bool is_compute) +si_sqtt_register_pipeline(struct si_context* sctx, struct si_sqtt_fake_pipeline *pipeline, bool is_compute) { struct ac_thread_trace_data *thread_trace_data = sctx->thread_trace; - assert (!si_sqtt_pipeline_is_registered(thread_trace_data, pipeline_hash)); + assert (!si_sqtt_pipeline_is_registered(thread_trace_data, pipeline->code_hash)); - bool result = ac_sqtt_add_pso_correlation(thread_trace_data, pipeline_hash); + bool result = ac_sqtt_add_pso_correlation(thread_trace_data, pipeline->code_hash); if (!result) return false; - result = ac_sqtt_add_code_object_loader_event(thread_trace_data, pipeline_hash, base_address); + result = ac_sqtt_add_code_object_loader_event(thread_trace_data, pipeline->code_hash, pipeline->bo->gpu_address); if (!result) return false; - return si_sqtt_add_code_object(sctx, pipeline_hash, is_compute); + return si_sqtt_add_code_object(sctx, pipeline, is_compute); } void diff --git a/src/gallium/drivers/radeonsi/si_state.h b/src/gallium/drivers/radeonsi/si_state.h index fa60be47933..8e2eb8dd744 100644 --- a/src/gallium/drivers/radeonsi/si_state.h +++ b/src/gallium/drivers/radeonsi/si_state.h @@ -187,6 +187,7 @@ union si_state { struct si_pm4_state *vgt_shader_config; struct si_shader *vs; struct si_shader *ps; + 
struct si_sqtt_fake_pipeline *sqtt_pipeline; } named; struct si_pm4_state *array[sizeof(struct si_state_named) / sizeof(struct si_pm4_state *)]; }; diff --git a/src/gallium/drivers/radeonsi/si_state_draw.cpp b/src/gallium/drivers/radeonsi/si_state_draw.cpp index 0a6f2f3bd8f..afc68b4c7fa 100644 --- a/src/gallium/drivers/radeonsi/si_state_draw.cpp +++ b/src/gallium/drivers/radeonsi/si_state_draw.cpp @@ -29,6 +29,8 @@ #include "util/u_index_modify.h" #include "util/u_prim.h" #include "util/u_upload_mgr.h" +#include "ac_rtld.h" +#include "si_build_pm4.h" #if (GFX_VER == 6) #define GFX(name) name##GFX6 @@ -303,28 +305,107 @@ static bool si_update_shaders(struct si_context *sctx) } if (unlikely(sctx->screen->debug_flags & DBG(SQTT) && sctx->thread_trace)) { - /* Pretend the bound shaders form a vk pipeline */ - uint32_t pipeline_code_hash = 0; - uint64_t base_address = ~0; + /* Pretend the bound shaders form a vk pipeline. Include the scratch size in + * the hash calculation to force re-emitting the pipeline if the scratch bo + * changes. + */ + uint64_t scratch_bo_size = sctx->scratch_buffer ? sctx->scratch_buffer->bo_size : 0; + uint64_t pipeline_code_hash = scratch_bo_size; + uint32_t total_size = 0; + /* Compute pipeline code hash. */ for (int i = 0; i < SI_NUM_GRAPHICS_SHADERS; i++) { struct si_shader *shader = sctx->shaders[i].current; if (sctx->shaders[i].cso && shader) { - pipeline_code_hash = _mesa_hash_data_with_seed( + pipeline_code_hash = XXH64( shader->binary.elf_buffer, shader->binary.elf_size, pipeline_code_hash); - base_address = MIN2(base_address, - shader->bo->gpu_address); + + total_size += ALIGN(shader->binary.uploaded_code_size, 256); } } + struct si_sqtt_fake_pipeline *pipeline = NULL; struct ac_thread_trace_data *thread_trace_data = sctx->thread_trace; if (!si_sqtt_pipeline_is_registered(thread_trace_data, pipeline_code_hash)) { - si_sqtt_register_pipeline(sctx, pipeline_code_hash, base_address, false); + /* This is a new pipeline. 
Allocate a new bo to hold all the shaders. Without + * this, shader code export process creates huge rgp files because RGP assumes + * the shaders live sequentially in memory (shader N address = shader 0 + offset N) + */ + struct si_resource *bo = si_aligned_buffer_create( + &sctx->screen->b, + (sctx->screen->info.cpdma_prefetch_writes_memory ? 0 : SI_RESOURCE_FLAG_READ_ONLY) | + SI_RESOURCE_FLAG_DRIVER_INTERNAL | SI_RESOURCE_FLAG_32BIT, + PIPE_USAGE_IMMUTABLE, align(total_size, SI_CPDMA_ALIGNMENT), 256); + + char *ptr = (char *) (bo ? sctx->screen->ws->buffer_map(sctx->screen->ws, + bo->buf, NULL, + (enum pipe_map_flags)(PIPE_MAP_READ_WRITE | PIPE_MAP_UNSYNCHRONIZED | RADEON_MAP_TEMPORARY)) : + NULL); + + uint32_t offset = 0; + uint64_t scratch_va = sctx->scratch_buffer ? sctx->scratch_buffer->gpu_address : 0; + + if (ptr) { + pipeline = (struct si_sqtt_fake_pipeline *) + CALLOC(1, sizeof(struct si_sqtt_fake_pipeline)); + pipeline->code_hash = pipeline_code_hash; + si_resource_reference(&pipeline->bo, bo); + + /* Re-upload all gfx shaders and init PM4. 
*/ + si_pm4_clear_state(&pipeline->pm4); + + for (int i = 0; i < SI_NUM_GRAPHICS_SHADERS; i++) { + struct si_shader *shader = sctx->shaders[i].current; + if (sctx->shaders[i].cso && shader) { + struct ac_rtld_binary binary; + si_shader_binary_open(sctx->screen, shader, &binary); + + struct ac_rtld_upload_info u = {}; + u.binary = &binary; + u.get_external_symbol = si_get_external_symbol; + u.cb_data = &scratch_va; + u.rx_va = bo->gpu_address + offset; + u.rx_ptr = ptr + offset; + + int size = ac_rtld_upload(&u); + ac_rtld_close(&binary); + + pipeline->offset[i] = offset; + + offset += align(size, 256); + + struct si_pm4_state *pm4 = &shader->pm4; + + uint32_t va_low = (pipeline->bo->gpu_address + pipeline->offset[i]) >> 8; + assert(PKT3_IT_OPCODE_G(pm4->pm4[pm4->reg_va_low_idx - 2]) == PKT3_SET_SH_REG); + uint32_t reg = (pm4->pm4[pm4->reg_va_low_idx - 1] << 2) + SI_SH_REG_OFFSET; + si_pm4_set_reg(&pipeline->pm4, reg, va_low); + } + } + sctx->screen->ws->buffer_unmap(sctx->screen->ws, bo->buf); + + _mesa_hash_table_u64_insert(sctx->thread_trace->pipeline_bos, + pipeline_code_hash, pipeline); + + si_sqtt_register_pipeline(sctx, pipeline, false); + } else { + if (bo) + si_resource_reference(&bo, NULL); + } + } else { + pipeline = (struct si_sqtt_fake_pipeline *) + _mesa_hash_table_u64_search(sctx->thread_trace->pipeline_bos, pipeline_code_hash); } + assert(pipeline); + + pipeline->code_hash = pipeline_code_hash; + radeon_add_to_buffer_list(sctx, &sctx->gfx_cs, pipeline->bo, + RADEON_USAGE_READ | RADEON_PRIO_SHADER_BINARY); si_sqtt_describe_pipeline_bind(sctx, pipeline_code_hash, 0); + si_pm4_bind_state(sctx, sqtt_pipeline, pipeline); } if ((GFX_VERSION <= GFX8 && diff --git a/src/gallium/drivers/radeonsi/si_state_shaders.cpp b/src/gallium/drivers/radeonsi/si_state_shaders.cpp index fb301bdc403..c2c09185f8a 100644 --- a/src/gallium/drivers/radeonsi/si_state_shaders.cpp +++ b/src/gallium/drivers/radeonsi/si_state_shaders.cpp @@ -695,6 +695,7 @@ static void 
si_shader_ls(struct si_screen *sscreen, struct si_shader *shader) va = shader->bo->gpu_address; si_pm4_set_reg(pm4, R_00B520_SPI_SHADER_PGM_LO_LS, va >> 8); + pm4->reg_va_low_idx = pm4->ndw - 1; shader->config.rsrc1 = S_00B528_VGPRS((shader->config.num_vgprs - 1) / 4) | S_00B528_SGPRS((shader->config.num_sgprs - 1) / 8) | @@ -729,6 +730,7 @@ static void si_shader_hs(struct si_screen *sscreen, struct si_shader *shader) } else { si_pm4_set_reg(pm4, R_00B410_SPI_SHADER_PGM_LO_LS, va >> 8); } + pm4->reg_va_low_idx = pm4->ndw - 1; unsigned num_user_sgprs = si_get_num_vs_user_sgprs(shader, GFX9_TCS_NUM_USER_SGPR); @@ -741,6 +743,7 @@ static void si_shader_hs(struct si_screen *sscreen, struct si_shader *shader) shader->config.rsrc2 |= S_00B42C_USER_SGPR_MSB_GFX9(num_user_sgprs >> 5); } else { si_pm4_set_reg(pm4, R_00B420_SPI_SHADER_PGM_LO_HS, va >> 8); + pm4->reg_va_low_idx = pm4->ndw - 1; si_pm4_set_reg(pm4, R_00B424_SPI_SHADER_PGM_HI_HS, S_00B424_MEM_BASE(sscreen->info.address32_hi >> 8)); @@ -816,6 +819,7 @@ static void si_shader_es(struct si_screen *sscreen, struct si_shader *shader) oc_lds_en = shader->selector->stage == MESA_SHADER_TESS_EVAL ? 
1 : 0; si_pm4_set_reg(pm4, R_00B320_SPI_SHADER_PGM_LO_ES, va >> 8); + pm4->reg_va_low_idx = pm4->ndw - 1; si_pm4_set_reg(pm4, R_00B324_SPI_SHADER_PGM_HI_ES, S_00B324_MEM_BASE(sscreen->info.address32_hi >> 8)); si_pm4_set_reg(pm4, R_00B328_SPI_SHADER_PGM_RSRC1_ES, @@ -1104,6 +1108,7 @@ static void si_shader_gs(struct si_screen *sscreen, struct si_shader *shader) } else { si_pm4_set_reg(pm4, R_00B210_SPI_SHADER_PGM_LO_ES, va >> 8); } + pm4->reg_va_low_idx = pm4->ndw - 1; uint32_t rsrc1 = S_00B228_VGPRS((shader->config.num_vgprs - 1) / 4) | S_00B228_DX10_CLAMP(1) | S_00B228_MEM_ORDERED(si_shader_mem_ordered(shader)) | @@ -1149,6 +1154,7 @@ static void si_shader_gs(struct si_screen *sscreen, struct si_shader *shader) S_00B21C_WAVE_LIMIT(0x3F); si_pm4_set_reg(pm4, R_00B220_SPI_SHADER_PGM_LO_GS, va >> 8); + pm4->reg_va_low_idx = pm4->ndw - 1; si_pm4_set_reg(pm4, R_00B224_SPI_SHADER_PGM_HI_GS, S_00B224_MEM_BASE(sscreen->info.address32_hi >> 8)); @@ -1421,6 +1427,7 @@ static void gfx10_shader_ngg(struct si_screen *sscreen, struct si_shader *shader &late_alloc_wave64, &cu_mask); si_pm4_set_reg(pm4, R_00B320_SPI_SHADER_PGM_LO_ES, va >> 8); + pm4->reg_va_low_idx = pm4->ndw - 1; si_pm4_set_reg( pm4, R_00B228_SPI_SHADER_PGM_RSRC1_GS, S_00B228_VGPRS((shader->config.num_vgprs - 1) / (shader->wave_size == 32 ? 
8 : 4)) | @@ -1733,6 +1740,7 @@ static void si_shader_vs(struct si_screen *sscreen, struct si_shader *shader, } si_pm4_set_reg(pm4, R_00B120_SPI_SHADER_PGM_LO_VS, va >> 8); + pm4->reg_va_low_idx = pm4->ndw - 1; si_pm4_set_reg(pm4, R_00B124_SPI_SHADER_PGM_HI_VS, S_00B124_MEM_BASE(sscreen->info.address32_hi >> 8)); @@ -2008,6 +2016,7 @@ static void si_shader_ps(struct si_screen *sscreen, struct si_shader *shader) va = shader->bo->gpu_address; si_pm4_set_reg(pm4, R_00B020_SPI_SHADER_PGM_LO_PS, va >> 8); + pm4->reg_va_low_idx = pm4->ndw - 1; si_pm4_set_reg(pm4, R_00B024_SPI_SHADER_PGM_HI_PS, S_00B024_MEM_BASE(sscreen->info.address32_hi >> 8)); @@ -2078,6 +2087,9 @@ static void si_shader_init_pm4_state(struct si_screen *sscreen, struct si_shader default: assert(0); } + + if (unlikely(sscreen->debug_flags & DBG(SQTT))) + assert(shader->pm4.reg_va_low_idx != 0); } static void si_clear_vs_key_inputs(struct si_context *sctx, union si_shader_key *key,