radv: Remove PSIZ output when it isn't needed.

PSIZ output is only needed when:
1. There is a next stage and it reads it.
2. The shader is the last vertex pipeline stage and the primitive topology is point list.

Zink always adds this output in its vertex (and other) shaders,
because it helps Zink avoid recompiling shader variants.

However, this has a performance impact for RADV because
it needs a scalar memory load. That becomes noticeable
at high primitive rates.

The Fossil stats are unremarkable because our DB doesn't include any
shaders from Zink or D9VK, but there are a few affected shaders.

Note that there may be an increase in LDS use in some GS. This is
because, with PSIZ removed, the ES per-vertex LDS size is smaller, so
we can squeeze more GS threads into the same workgroup.

Fossil DB stats on Sienna Cichlid:

Totals from 14 (0.01% of 128647) affected shaders:
CodeSize: 119884 -> 119732 (-0.13%)
LDS: 235008 -> 228864 (-2.61%); split: -2.83%, +0.22%
Instrs: 23076 -> 23048 (-0.12%)
Latency: 71667 -> 71625 (-0.06%)
InvThroughput: 19155 -> 18870 (-1.49%)
Copies: 1586 -> 1572 (-0.88%)

Signed-off-by: Timur Kristóf <timur.kristof@gmail.com>
Reviewed-By: Mike Blumenkrantz <michael.blumenkrantz@gmail.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/10725>
This commit is contained in:
Timur Kristóf 2021-05-10 13:15:12 +02:00 committed by Marge Bot
parent a2c30c1488
commit 92e1981a80

View file

@ -2305,7 +2305,9 @@ get_vs_output_info(const struct radv_pipeline *pipeline)
}
static void
radv_link_shaders(struct radv_pipeline *pipeline, nir_shader **shaders,
radv_link_shaders(struct radv_pipeline *pipeline,
const struct radv_pipeline_key *pipeline_key,
nir_shader **shaders,
bool optimize_conservatively)
{
nir_shader *ordered_shaders[MESA_SHADER_STAGES];
@ -2389,6 +2391,39 @@ radv_link_shaders(struct radv_pipeline *pipeline, nir_shader **shaders,
}
}
if (!optimize_conservatively) {
/* Remove PSIZ from shaders when it's not needed.
* This is typically produced by translation layers like Zink or D9VK.
*/
for (unsigned i = 0; i < shader_count; ++i) {
shader_info *info = &ordered_shaders[i]->info;
if (!(info->outputs_written & VARYING_BIT_PSIZ))
continue;
bool next_stage_needs_psiz =
i != 0 && /* ordered_shaders is backwards, so next stage is: i - 1 */
ordered_shaders[i - 1]->info.inputs_read & VARYING_BIT_PSIZ;
bool topology_uses_psiz =
info->stage == pipeline->graphics.last_vgt_api_stage &&
((info->stage == MESA_SHADER_VERTEX && pipeline_key->topology == VK_PRIMITIVE_TOPOLOGY_POINT_LIST) ||
(info->stage == MESA_SHADER_TESS_EVAL && info->tess.point_mode) ||
(info->stage == MESA_SHADER_GEOMETRY && info->gs.output_primitive == GL_POINTS));
if (!next_stage_needs_psiz && !topology_uses_psiz) {
/* Change PSIZ to a global variable which allows it to be DCE'd. */
nir_variable *psiz_var =
nir_find_variable_with_location(ordered_shaders[i], nir_var_shader_out, VARYING_SLOT_PSIZ);
psiz_var->data.location = 0;
psiz_var->data.mode = nir_var_shader_temp;
info->outputs_written &= ~VARYING_BIT_PSIZ;
nir_fixup_deref_modes(ordered_shaders[i]);
nir_remove_dead_variables(ordered_shaders[i], nir_var_shader_temp, NULL);
nir_opt_dce(ordered_shaders[i]);
}
}
}
for (int i = 1; !optimize_conservatively && (i < shader_count); ++i) {
if (nir_link_opt_varyings(ordered_shaders[i], ordered_shaders[i - 1])) {
nir_opt_constant_folding(ordered_shaders[i - 1]);
@ -3395,7 +3430,7 @@ radv_create_shaders(struct radv_pipeline *pipeline, struct radv_device *device,
bool optimize_conservatively = flags & VK_PIPELINE_CREATE_DISABLE_OPTIMIZATION_BIT;
radv_link_shaders(pipeline, nir, optimize_conservatively);
radv_link_shaders(pipeline, pipeline_key, nir, optimize_conservatively);
radv_set_driver_locations(pipeline, nir, infos);
for (int i = 0; i < MESA_SHADER_STAGES; ++i) {