From 92e1981a8005c0ca7f48f38ddf8ff2b99ce95e7e Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Timur=20Krist=C3=B3f?=
Date: Mon, 10 May 2021 13:15:12 +0200
Subject: [PATCH] radv: Remove PSIZ output when it isn't needed.
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

PSIZ output is only needed when:
1. There is a next stage and it reads it.
2. Primitive topology is point list, in the last vertex pipeline stage.

Zink always adds this output in its vertex (and other) shaders,
because it helps Zink avoid recompiling shader variants.
However, this has a performance impact for RADV because it
needs a scalar memory load. That becomes noticeable at high
primitive rates.

The Fossil stats are unremarkable because our DB doesn't include any
shaders from Zink or D9VK, but there are a few affected shaders.

Note that there may be an increase in LDS use in some GS.
This is because with PSIZ removed the ES per-vertex LDS size
is smaller, so we can squeeze more GS threads in the same
workgroup.

Fossil DB stats on Sienna Cichlid: Totals from 14 (0.01% of 128647) affected shaders: CodeSize: 119884 -> 119732 (-0.13%) LDS: 235008 -> 228864 (-2.61%); split: -2.83%, +0.22% Instrs: 23076 -> 23048 (-0.12%) Latency: 71667 -> 71625 (-0.06%) InvThroughput: 19155 -> 18870 (-1.49%) Copies: 1586 -> 1572 (-0.88%) Signed-off-by: Timur Kristóf Reviewed-By: Mike Blumenkrantz Part-of: --- src/amd/vulkan/radv_pipeline.c | 39 ++++++++++++++++++++++++++++++++-- 1 file changed, 37 insertions(+), 2 deletions(-) diff --git a/src/amd/vulkan/radv_pipeline.c b/src/amd/vulkan/radv_pipeline.c index 414ede8a173..5e24050b4e6 100644 --- a/src/amd/vulkan/radv_pipeline.c +++ b/src/amd/vulkan/radv_pipeline.c @@ -2305,7 +2305,9 @@ get_vs_output_info(const struct radv_pipeline *pipeline) } static void -radv_link_shaders(struct radv_pipeline *pipeline, nir_shader **shaders, +radv_link_shaders(struct radv_pipeline *pipeline, + const struct radv_pipeline_key *pipeline_key, + nir_shader **shaders, bool optimize_conservatively) { nir_shader *ordered_shaders[MESA_SHADER_STAGES]; @@ -2389,6 +2391,39 @@ radv_link_shaders(struct radv_pipeline *pipeline, nir_shader **shaders, } } + if (!optimize_conservatively) { + /* Remove PSIZ from shaders when it's not needed. + * This is typically produced by translation layers like Zink or D9VK. 
+ */ + for (unsigned i = 0; i < shader_count; ++i) { + shader_info *info = &ordered_shaders[i]->info; + if (!(info->outputs_written & VARYING_BIT_PSIZ)) + continue; + + bool next_stage_needs_psiz = + i != 0 && /* ordered_shaders is backwards, so next stage is: i - 1 */ + ordered_shaders[i - 1]->info.inputs_read & VARYING_BIT_PSIZ; + bool topology_uses_psiz = + info->stage == pipeline->graphics.last_vgt_api_stage && + ((info->stage == MESA_SHADER_VERTEX && pipeline_key->topology == VK_PRIMITIVE_TOPOLOGY_POINT_LIST) || + (info->stage == MESA_SHADER_TESS_EVAL && info->tess.point_mode) || + (info->stage == MESA_SHADER_GEOMETRY && info->gs.output_primitive == GL_POINTS)); + + if (!next_stage_needs_psiz && !topology_uses_psiz) { + /* Change PSIZ to a global variable which allows it to be DCE'd. */ + nir_variable *psiz_var = + nir_find_variable_with_location(ordered_shaders[i], nir_var_shader_out, VARYING_SLOT_PSIZ); + psiz_var->data.location = 0; + psiz_var->data.mode = nir_var_shader_temp; + + info->outputs_written &= ~VARYING_BIT_PSIZ; + nir_fixup_deref_modes(ordered_shaders[i]); + nir_remove_dead_variables(ordered_shaders[i], nir_var_shader_temp, NULL); + nir_opt_dce(ordered_shaders[i]); + } + } + } + for (int i = 1; !optimize_conservatively && (i < shader_count); ++i) { if (nir_link_opt_varyings(ordered_shaders[i], ordered_shaders[i - 1])) { nir_opt_constant_folding(ordered_shaders[i - 1]); @@ -3395,7 +3430,7 @@ radv_create_shaders(struct radv_pipeline *pipeline, struct radv_device *device, bool optimize_conservatively = flags & VK_PIPELINE_CREATE_DISABLE_OPTIMIZATION_BIT; - radv_link_shaders(pipeline, nir, optimize_conservatively); + radv_link_shaders(pipeline, pipeline_key, nir, optimize_conservatively); radv_set_driver_locations(pipeline, nir, infos); for (int i = 0; i < MESA_SHADER_STAGES; ++i) {