From 8fcb4aa0ebd7b9d0d8f80986fb817afea2fc4a87 Mon Sep 17 00:00:00 2001 From: Samuel Pitoiset Date: Mon, 15 Aug 2022 11:59:03 +0200 Subject: [PATCH] radv: compact MRTs to save PS export memory space If there are holes between color outputs (e.g. a shader exports MRT1, but not MRT0), we can remove the holes by moving higher MRTs lower. The hardware will remap the MRTs to their correct locations if we remove holes in SPI_SHADER_COL_FORMAT but not CB_SHADER_MASK. This is good for performance because the hardware will allocate less space for color MRTs. This also allows removing even more unused color exports because we no longer need to force previous targets to be non-zero. Only SotTR seems affected in our fossils db. fossils-db (NAVI21): Totals from 859 (0.64% of 134913) affected shaders: VGPRs: 24328 -> 24216 (-0.46%) CodeSize: 1433276 -> 1422576 (-0.75%) Instrs: 255275 -> 253728 (-0.61%) Latency: 1666836 -> 1661544 (-0.32%) InvThroughput: 346038 -> 343406 (-0.76%) Copies: 16520 -> 16506 (-0.08%) PreSGPRs: 25934 -> 25920 (-0.05%) PreVGPRs: 19903 -> 19662 (-1.21%) Signed-off-by: Samuel Pitoiset Reviewed-by: Bas Nieuwenhuizen Part-of: --- .../compiler/aco_instruction_selection.cpp | 19 +++++--- src/amd/vulkan/radv_nir_to_llvm.c | 18 ++++---- src/amd/vulkan/radv_pipeline.c | 44 ++++++++++++++----- 3 files changed, 55 insertions(+), 26 deletions(-) diff --git a/src/amd/compiler/aco_instruction_selection.cpp b/src/amd/compiler/aco_instruction_selection.cpp index cfb20a4a8b1..8904b21f7d5 100644 --- a/src/amd/compiler/aco_instruction_selection.cpp +++ b/src/amd/compiler/aco_instruction_selection.cpp @@ -11310,6 +11310,8 @@ create_fs_exports(isel_context* ctx) if (ctx->program->info.ps.has_epilog) { create_fs_jump_to_epilog(ctx); } else { + unsigned compacted_mrt_index = 0; + /* Export all color render targets. 
*/ for (unsigned i = FRAG_RESULT_DATA0; i < FRAG_RESULT_DATA7 + 1; ++i) { if (!ctx->outputs.mask[i]) @@ -11317,9 +11319,9 @@ create_fs_exports(isel_context* ctx) struct mrt_color_export out = {0}; - out.slot = i - FRAG_RESULT_DATA0; + out.slot = compacted_mrt_index; out.write_mask = ctx->outputs.mask[i]; - out.col_format = (ctx->options->key.ps.col_format >> (4 * out.slot)) & 0xf; + out.col_format = (ctx->options->key.ps.col_format >> (4 * (i - FRAG_RESULT_DATA0))) & 0xf; for (unsigned c = 0; c < 4; ++c) { if (out.write_mask & (1 << c)) { @@ -11329,7 +11331,10 @@ create_fs_exports(isel_context* ctx) } } - exported |= export_fs_mrt_color(ctx, &out, false); + if (export_fs_mrt_color(ctx, &out, false)) { + compacted_mrt_index++; + exported = true; + } } if (!exported) @@ -12449,6 +12454,7 @@ select_ps_epilog(Program* program, const struct aco_ps_epilog_key* key, ac_shade Builder bld(ctx.program, ctx.block); /* Export all color render targets */ + unsigned compacted_mrt_index = 0; bool exported = false; for (unsigned i = 0; i < 8; i++) { @@ -12459,7 +12465,7 @@ select_ps_epilog(Program* program, const struct aco_ps_epilog_key* key, ac_shade struct mrt_color_export out; - out.slot = i; + out.slot = compacted_mrt_index; out.write_mask = 0xf; out.col_format = col_format; out.is_int8 = (key->color_is_int8 >> i) & 1; @@ -12471,7 +12477,10 @@ select_ps_epilog(Program* program, const struct aco_ps_epilog_key* key, ac_shade out.values[c] = Operand(emit_extract_vector(&ctx, inputs, c, v1)); } - exported |= export_fs_mrt_color(&ctx, &out, true); + if (export_fs_mrt_color(&ctx, &out, true)) { + compacted_mrt_index++; + exported = true; + } } if (!exported) diff --git a/src/amd/vulkan/radv_nir_to_llvm.c b/src/amd/vulkan/radv_nir_to_llvm.c index f882d50ec19..adf01c7630b 100644 --- a/src/amd/vulkan/radv_nir_to_llvm.c +++ b/src/amd/vulkan/radv_nir_to_llvm.c @@ -578,7 +578,8 @@ scan_shader_output_decl(struct radv_shader_context *ctx, struct nir_variable *va /* Initialize arguments for 
the shader export intrinsic */ static void si_llvm_init_export_args(struct radv_shader_context *ctx, LLVMValueRef *values, - unsigned enabled_channels, unsigned target, struct ac_export_args *args) + unsigned enabled_channels, unsigned target, unsigned index, + struct ac_export_args *args) { /* Specify the channels that are enabled. */ args->enabled_channels = enabled_channels; @@ -603,7 +604,6 @@ si_llvm_init_export_args(struct radv_shader_context *ctx, LLVMValueRef *values, bool is_16bit = ac_get_type_size(LLVMTypeOf(values[0])) == 2; if (ctx->stage == MESA_SHADER_FRAGMENT) { - unsigned index = target - V_008DFC_SQ_EXP_MRT; unsigned col_format = (ctx->options->key.ps.col_format >> (4 * index)) & 0xf; bool is_int8 = (ctx->options->key.ps.is_int8 >> index) & 1; bool is_int10 = (ctx->options->key.ps.is_int10 >> index) & 1; @@ -743,7 +743,7 @@ radv_export_param(struct radv_shader_context *ctx, unsigned index, LLVMValueRef { struct ac_export_args args; - si_llvm_init_export_args(ctx, values, enabled_channels, V_008DFC_SQ_EXP_PARAM + index, &args); + si_llvm_init_export_args(ctx, values, enabled_channels, V_008DFC_SQ_EXP_PARAM + index, 0, &args); ac_build_export(&ctx->ac, &args); } @@ -915,7 +915,7 @@ radv_llvm_export_vs(struct radv_shader_context *ctx, struct radv_shader_output_v for (i = 0; i < noutput; i++) { switch (outputs[i].slot_name) { case VARYING_SLOT_POS: - si_llvm_init_export_args(ctx, outputs[i].values, 0xf, V_008DFC_SQ_EXP_POS, &pos_args[0]); + si_llvm_init_export_args(ctx, outputs[i].values, 0xf, V_008DFC_SQ_EXP_POS, 0, &pos_args[0]); break; case VARYING_SLOT_PSIZ: psize_value = outputs[i].values[0]; @@ -932,7 +932,7 @@ radv_llvm_export_vs(struct radv_shader_context *ctx, struct radv_shader_output_v case VARYING_SLOT_CLIP_DIST0: case VARYING_SLOT_CLIP_DIST1: index = 2 + outputs[i].slot_index; - si_llvm_init_export_args(ctx, outputs[i].values, 0xf, V_008DFC_SQ_EXP_POS + index, + si_llvm_init_export_args(ctx, outputs[i].values, 0xf, V_008DFC_SQ_EXP_POS + 
index, 0, &pos_args[index]); break; default: @@ -1064,11 +1064,11 @@ handle_vs_outputs_post(struct radv_shader_context *ctx) } static bool -si_export_mrt_color(struct radv_shader_context *ctx, LLVMValueRef *color, unsigned index, - struct ac_export_args *args) +si_export_mrt_color(struct radv_shader_context *ctx, LLVMValueRef *color, unsigned target, + unsigned index, struct ac_export_args *args) { /* Export */ - si_llvm_init_export_args(ctx, color, 0xf, V_008DFC_SQ_EXP_MRT + index, args); + si_llvm_init_export_args(ctx, color, 0xf, V_008DFC_SQ_EXP_MRT + target, index, args); if (!args->enabled_channels) return false; /* unnecessary NULL export */ @@ -1105,7 +1105,7 @@ handle_fs_outputs_post(struct radv_shader_context *ctx) for (unsigned j = 0; j < 4; j++) values[j] = ac_to_float(&ctx->ac, radv_load_output(ctx, i, j)); - bool ret = si_export_mrt_color(ctx, values, i - FRAG_RESULT_DATA0, &color_args[index]); + bool ret = si_export_mrt_color(ctx, values, index, i - FRAG_RESULT_DATA0, &color_args[index]); if (ret) index++; } diff --git a/src/amd/vulkan/radv_pipeline.c b/src/amd/vulkan/radv_pipeline.c index 71b3580346f..2df56bdc0f5 100644 --- a/src/amd/vulkan/radv_pipeline.c +++ b/src/amd/vulkan/radv_pipeline.c @@ -533,13 +533,41 @@ format_is_float32(VkFormat format) desc->channel[channel].type == UTIL_FORMAT_TYPE_FLOAT && desc->channel[channel].size == 32; } +static unsigned +radv_compact_spi_shader_col_format(const struct radv_shader *ps, + const struct radv_blend_state *blend) +{ + unsigned spi_shader_col_format = blend->spi_shader_col_format; + unsigned value = 0, num_mrts = 0; + unsigned i, num_targets; + + /* Make sure to clear color attachments without exports because MRT holes are removed during + * compilation for optimal performance. + */ + spi_shader_col_format &= ps->info.ps.colors_written; + + /* Compute the number of MRTs. */ + num_targets = DIV_ROUND_UP(util_last_bit(spi_shader_col_format), 4); + + /* Remove holes in spi_shader_col_format. 
*/ + for (i = 0; i < num_targets; i++) { + unsigned spi_format = (spi_shader_col_format >> (i * 4)) & 0xf; + + if (spi_format) { + value |= spi_format << (num_mrts * 4); + num_mrts++; + } + } + + return value; +} + static void radv_pipeline_compute_spi_color_formats(const struct radv_graphics_pipeline *pipeline, struct radv_blend_state *blend, const struct vk_graphics_pipeline_state *state) { unsigned col_format = 0, is_int8 = 0, is_int10 = 0, is_float32 = 0; - unsigned num_targets; for (unsigned i = 0; i < state->rp->color_attachment_count; ++i) { unsigned cf; @@ -572,16 +600,6 @@ radv_pipeline_compute_spi_color_formats(const struct radv_graphics_pipeline *pip col_format |= V_028714_SPI_SHADER_32_AR; } - /* If the i-th target format is set, all previous target formats must - * be non-zero to avoid hangs. - */ - num_targets = (util_last_bit(col_format) + 3) / 4; - for (unsigned i = 0; i < num_targets; i++) { - if (!(col_format & (0xfu << (i * 4)))) { - col_format |= V_028714_SPI_SHADER_32_R << (i * 4); - } - } - /* The output for dual source blending should have the same format as * the first output. */ @@ -6030,6 +6048,9 @@ radv_graphics_pipeline_init(struct radv_graphics_pipeline *pipeline, struct radv if (device->physical_device->rad_info.gfx_level >= GFX10_3) gfx103_pipeline_init_vrs_state(pipeline, &state); + struct radv_shader *ps = pipeline->base.shaders[MESA_SHADER_FRAGMENT]; + blend.spi_shader_col_format = radv_compact_spi_shader_col_format(ps, &blend); + /* Ensure that some export memory is always allocated, for two reasons: * * 1) Correctness: The hardware ignores the EXEC mask if no export @@ -6045,7 +6066,6 @@ radv_graphics_pipeline_init(struct radv_graphics_pipeline *pipeline, struct radv * color and Z formats to SPI_SHADER_ZERO. The hw will skip export * instructions if any are present. 
*/ - struct radv_shader *ps = pipeline->base.shaders[MESA_SHADER_FRAGMENT]; if ((device->physical_device->rad_info.gfx_level <= GFX9 || ps->info.ps.can_discard) && !blend.spi_shader_col_format) { if (!ps->info.ps.writes_z && !ps->info.ps.writes_stencil && !ps->info.ps.writes_sample_mask)