radv, aco, ac/nir: Tweak position export scheduling for NGG culling.

The result is about +5-ish fps in Doom Eternal.

It turns out that the location of position exports matters more
than we thought, and it's actually better to keep them at the bottom
for culling shaders rather than schedule them up to the top.

Signed-off-by: Timur Kristóf <timur.kristof@gmail.com>
Reviewed-by: Daniel Schürmann <daniel@schuermann.dev>
Reviewed-by: Samuel Pitoiset <samuel.pitoiset@gmail.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/10525>
This commit is contained in:
Timur Kristóf 2021-07-05 15:26:18 +02:00 committed by Marge Bot
parent 0bb543bb60
commit 8341af5109
5 changed files with 20 additions and 3 deletions

View file

@@ -96,6 +96,7 @@ typedef struct
unsigned lds_bytes_if_culling_off;
bool can_cull;
bool passthrough;
bool early_prim_export;
uint64_t nggc_inputs_read_by_pos;
uint64_t nggc_inputs_read_by_others;
} ac_nir_ngg_config;

View file

@@ -1290,6 +1290,7 @@ ac_nir_lower_ngg_nogs(nir_shader *shader,
.lds_bytes_if_culling_off = lds_bytes_if_culling_off,
.can_cull = can_cull,
.passthrough = passthrough,
.early_prim_export = state.early_prim_export,
.nggc_inputs_read_by_pos = state.inputs_needed_by_pos,
.nggc_inputs_read_by_others = state.inputs_needed_by_others,
};

View file

@@ -126,6 +126,8 @@ struct sched_ctx {
int16_t last_SMEM_stall;
int last_SMEM_dep_idx;
MoveState mv;
bool schedule_pos_exports = true;
unsigned schedule_pos_export_div = 1;
};
/* This scheduler is a simple bottom-up pass based on ideas from
@@ -928,8 +930,8 @@ schedule_position_export(sched_ctx& ctx, Block* block, std::vector<RegisterDemand
Instruction* current, int idx)
{
assert(idx != 0);
int window_size = POS_EXP_WINDOW_SIZE;
int max_moves = POS_EXP_MAX_MOVES;
int window_size = POS_EXP_WINDOW_SIZE / ctx.schedule_pos_export_div;
int max_moves = POS_EXP_MAX_MOVES / ctx.schedule_pos_export_div;
int16_t k = 0;
DownwardsCursor cursor = ctx.mv.downwards_init(idx, true, false);
@@ -982,7 +984,7 @@ schedule_block(sched_ctx& ctx, Program* program, Block* block, live& live_vars)
for (unsigned idx = 0; idx < block->instructions.size(); idx++) {
Instruction* current = block->instructions[idx].get();
if (block->kind & block_kind_export_end && current->isEXP()) {
if (block->kind & block_kind_export_end && current->isEXP() && ctx.schedule_pos_exports) {
unsigned target = current->exp().dest;
if (target >= V_008DFC_SQ_EXP_POS && target < V_008DFC_SQ_EXP_PRIM) {
ctx.mv.current = current;
@@ -1048,6 +1050,17 @@ schedule_program(Program* program, live& live_vars)
ctx.mv.max_registers = {int16_t(get_addr_vgpr_from_waves(program, ctx.num_waves * wave_fac) - 2),
int16_t(get_addr_sgpr_from_waves(program, ctx.num_waves * wave_fac))};
/* NGG culling shaders are very sensitive to position export scheduling.
* Schedule less aggressively when early primitive export is used, and
* keep the position export at the very bottom when late primitive export is used.
*/
if (program->info->has_ngg_culling && program->stage.num_sw_stages() == 1) {
if (!program->info->has_ngg_early_prim_export)
ctx.schedule_pos_exports = false;
else
ctx.schedule_pos_export_div = 4;
}
for (Block& block : program->blocks)
schedule_block(ctx, program, &block, live_vars);

View file

@@ -1017,6 +1017,7 @@ void radv_lower_ngg(struct radv_device *device, struct nir_shader *nir,
key->vs.provoking_vtx_last);
info->has_ngg_culling = out_conf.can_cull;
info->has_ngg_early_prim_export = out_conf.early_prim_export;
info->num_lds_blocks_when_not_culling = DIV_ROUND_UP(out_conf.lds_bytes_if_culling_off, device->physical_device->rad_info.lds_encode_granularity);
info->is_ngg_passthrough = out_conf.passthrough;
key->vs_common_out.as_ngg_passthrough = out_conf.passthrough;

View file

@@ -264,6 +264,7 @@ struct radv_shader_info {
bool is_ngg;
bool is_ngg_passthrough;
bool has_ngg_culling;
bool has_ngg_early_prim_export;
uint32_t num_lds_blocks_when_not_culling;
uint32_t num_tess_patches;
struct {