From aa322a37fcb6ab58c389b48186268af5f041a62c Mon Sep 17 00:00:00 2001 From: Connor Abbott Date: Mon, 31 Jul 2023 19:57:06 +0200 Subject: [PATCH] ir3: Implement helper invocation optimization MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This kills helper invocations to ensure that subsequent memory accesses don't fetch unused memory and unnecessary branch divergence from helper invocations is eliminated. shader-db results: total instructions in shared programs: 3840580 -> 3841531 (0.02%) instructions in affected programs: 278416 -> 279367 (0.34%) helped: 0 HURT: 744 HURT stats (abs) min: 1 max: 16 x̄: 1.28 x̃: 1 HURT stats (rel) min: 0.05% max: 8.51% x̄: 0.75% x̃: 0.39% 95% mean confidence interval for instructions value: 1.22 1.34 95% mean confidence interval for instructions %-change: 0.67% 0.83% Instructions are HURT. total nops in shared programs: 866716 -> 867667 (0.11%) nops in affected programs: 72851 -> 73802 (1.31%) helped: 0 HURT: 744 HURT stats (abs) min: 1 max: 16 x̄: 1.28 x̃: 1 HURT stats (rel) min: 0.17% max: 33.33% x̄: 2.84% x̃: 1.82% 95% mean confidence interval for nops value: 1.22 1.34 95% mean confidence interval for nops %-change: 2.59% 3.08% Nops are HURT. total last-baryf in shared programs: 139806 -> 139864 (0.04%) last-baryf in affected programs: 11772 -> 11830 (0.49%) helped: 0 HURT: 58 HURT stats (abs) min: 1 max: 1 x̄: 1.00 x̃: 1 HURT stats (rel) min: 0.40% max: 5.26% x̄: 0.60% x̃: 0.47% 95% mean confidence interval for last-baryf value: 1.00 1.00 95% mean confidence interval for last-baryf %-change: 0.42% 0.78% Last-baryf are HURT. 
total last-helper in shared programs: 1508295 -> 935561 (-37.97%) last-helper in affected programs: 1192594 -> 619860 (-48.02%) helped: 7816 HURT: 3 helped stats (abs) min: 1 max: 1095 x̄: 73.28 x̃: 34 helped stats (rel) min: 0.42% max: 100.00% x̄: 71.91% x̃: 100.00% HURT stats (abs) min: 1 max: 11 x̄: 4.67 x̃: 2 HURT stats (rel) min: 0.80% max: 1.44% x̄: 1.03% x̃: 0.86% 95% mean confidence interval for last-helper value: -75.64 -70.86 95% mean confidence interval for last-helper %-change: -72.67% -71.10% Last-helper are helped. fossil-db results: Totals: Instrs: 55172795 -> 55189122 (+0.03%) CodeSize: 108952746 -> 108984452 (+0.03%) NOPs: 11536680 -> 11553007 (+0.14%) (ss)-stall: 4166810 -> 4166581 (-0.01%) (sy)-stall: 15890324 -> 15884974 (-0.03%) last-baryf: 659588 -> 659633 (+0.01%) last-helper: 25742996 -> 12601636 (-51.05%); split: -51.05%, +0.00% Cat0: 12294891 -> 12311218 (+0.13%) Totals from 39576 (25.22% of 156916) affected shaders: Instrs: 24200008 -> 24216335 (+0.07%) CodeSize: 44968736 -> 45000442 (+0.07%) NOPs: 5854965 -> 5871292 (+0.28%) (ss)-stall: 2357830 -> 2357601 (-0.01%) (sy)-stall: 6166670 -> 6161320 (-0.09%) last-baryf: 590330 -> 590375 (+0.01%) last-helper: 24160432 -> 11019072 (-54.39%); split: -54.39%, +0.00% Cat0: 6205561 -> 6221888 (+0.26%) Part-of: --- src/freedreno/ir3/ir3.h | 56 ++++++++ src/freedreno/ir3/ir3_legalize.c | 214 +++++++++++++++++++++++++++++++ 2 files changed, 270 insertions(+) diff --git a/src/freedreno/ir3/ir3.h b/src/freedreno/ir3/ir3.h index 93acf03009a..167db6ba8e2 100644 --- a/src/freedreno/ir3/ir3.h +++ b/src/freedreno/ir3/ir3.h @@ -691,6 +691,12 @@ ir3_start_block(struct ir3 *ir) return list_first_entry(&ir->block_list, struct ir3_block, node); } +static inline struct ir3_block * +ir3_end_block(struct ir3 *ir) +{ + return list_last_entry(&ir->block_list, struct ir3_block, node); +} + static inline struct ir3_block * ir3_after_preamble(struct ir3 *ir) { @@ -1074,6 +1080,53 @@ is_input(struct ir3_instruction 
*instr) } } +/* Whether non-helper invocations can read the value of helper invocations. We + * cannot insert (eq) before these instructions. + */ +static inline bool +uses_helpers(struct ir3_instruction *instr) +{ + switch (instr->opc) { + /* These require helper invocations to be present */ + case OPC_SAM: + case OPC_SAMB: + case OPC_GETLOD: + case OPC_DSX: + case OPC_DSY: + case OPC_DSXPP_1: + case OPC_DSYPP_1: + case OPC_DSXPP_MACRO: + case OPC_DSYPP_MACRO: + case OPC_QUAD_SHUFFLE_BRCST: + case OPC_QUAD_SHUFFLE_HORIZ: + case OPC_QUAD_SHUFFLE_VERT: + case OPC_QUAD_SHUFFLE_DIAG: + case OPC_META_TEX_PREFETCH: + return true; + + /* Subgroup operations don't require helper invocations to be present, but + * will use helper invocations if they are present. + */ + case OPC_BALLOT_MACRO: + case OPC_ANY_MACRO: + case OPC_ALL_MACRO: + case OPC_ELECT_MACRO: + case OPC_READ_FIRST_MACRO: + case OPC_READ_COND_MACRO: + case OPC_MOVMSK: + case OPC_BRCST_ACTIVE: + return true; + + /* Catch lowered READ_FIRST/READ_COND. 
*/ + case OPC_MOV: + return (instr->dsts[0]->flags & IR3_REG_SHARED) && + !(instr->srcs[0]->flags & IR3_REG_SHARED); + + default: + return false; + } +} + static inline bool is_bool(struct ir3_instruction *instr) { @@ -1704,6 +1757,9 @@ __ssa_srcp_n(struct ir3_instruction *instr, unsigned n) /* iterators for instructions: */ #define foreach_instr(__instr, __list) \ list_for_each_entry (struct ir3_instruction, __instr, __list, node) +#define foreach_instr_from(__instr, __start, __list) \ + list_for_each_entry_from(struct ir3_instruction, __instr, &(__start)->node, \ + __list, node) #define foreach_instr_rev(__instr, __list) \ list_for_each_entry_rev (struct ir3_instruction, __instr, __list, node) #define foreach_instr_safe(__instr, __list) \ diff --git a/src/freedreno/ir3/ir3_legalize.c b/src/freedreno/ir3/ir3_legalize.c index a344a009ea1..0626113d7a0 100644 --- a/src/freedreno/ir3/ir3_legalize.c +++ b/src/freedreno/ir3/ir3_legalize.c @@ -908,6 +908,215 @@ nop_sched(struct ir3 *ir, struct ir3_shader_variant *so) } } +struct ir3_helper_block_data { + /* Whether helper invocations may be used on any path starting at the + * beginning of the block. + */ + bool uses_helpers_beginning; + + /* Whether helper invocations may be used by the end of the block. Branch + * instructions are considered to be "between" blocks, because (eq) has to be + * inserted after them in the successor blocks, so branch instructions using + * helpers will result in uses_helpers_end = true for their block. + */ + bool uses_helpers_end; +}; + +/* Insert (eq) after the last instruction using the results of helper + * invocations. Use a backwards dataflow analysis to determine at which points + * in the program helper invocations are definitely never used, and then insert + * (eq) at the point where we cross from a point where they may be used to a + * point where they are never used. 
+ */ +static void +helper_sched(struct ir3_legalize_ctx *ctx, struct ir3 *ir, + struct ir3_shader_variant *so) +{ + bool non_prefetch_helpers = false; + + foreach_block (block, &ir->block_list) { + struct ir3_helper_block_data *bd = + rzalloc(ctx, struct ir3_helper_block_data); + foreach_instr (instr, &block->instr_list) { + if (uses_helpers(instr)) { + bd->uses_helpers_beginning = true; + if (instr->opc != OPC_META_TEX_PREFETCH) { + non_prefetch_helpers = true; + break; + } + } + + if (instr->opc == OPC_SHPE) { + /* (eq) is not allowed in preambles, mark the whole preamble as + * requiring helpers to avoid putting it there. + */ + bd->uses_helpers_beginning = true; + bd->uses_helpers_end = true; + } + } + + if (block->brtype == IR3_BRANCH_ALL || + block->brtype == IR3_BRANCH_ANY || + block->brtype == IR3_BRANCH_GETONE) { + bd->uses_helpers_end = true; + } + + block->data = bd; + } + + /* If only prefetches use helpers then we can disable them in the shader via + * a register setting. + */ + if (!non_prefetch_helpers) { + so->prefetch_end_of_quad = true; + return; + } + + bool progress; + do { + progress = false; + foreach_block_rev (block, &ir->block_list) { + struct ir3_helper_block_data *bd = block->data; + + if (!bd->uses_helpers_beginning) + continue; + + for (unsigned i = 0; i < block->predecessors_count; i++) { + struct ir3_block *pred = block->predecessors[i]; + struct ir3_helper_block_data *pred_bd = pred->data; + if (!pred_bd->uses_helpers_end) { + pred_bd->uses_helpers_end = true; + } + if (!pred_bd->uses_helpers_beginning) { + pred_bd->uses_helpers_beginning = true; + progress = true; + } + } + } + } while (progress); + + /* Now, we need to determine the points where helper invocations become + * unused. 
+ */ + foreach_block (block, &ir->block_list) { + struct ir3_helper_block_data *bd = block->data; + if (bd->uses_helpers_end) + continue; + + /* We need to check the predecessors because of situations with critical + * edges like this that can occur after optimizing jumps: + * + * br p0.x, #endif + * ... + * sam ... + * ... + * endif: + * ... + * end + * + * The endif block will have uses_helpers_beginning = false and + * uses_helpers_end = false, but because we jump to there from the + * beginning of the if where uses_helpers_end = true, we still want to + * add an (eq) at the beginning of the block: + * + * br p0.x, #endif + * ... + * sam ... + * (eq)nop + * ... + * endif: + * (eq)nop + * ... + * end + * + * This is an extra nop in the case where the branch isn't taken, but that's + * probably preferable to adding an extra jump instruction which is what + * would happen if we ran this pass before optimizing jumps: + * + * br p0.x, #else + * ... + * sam ... + * (eq)nop + * ... + * jump #endif + * else: + * (eq)nop + * endif: + * ... + * end + * + * We also need this to make sure we insert (eq) after branches which use + * helper invocations. + */ + bool pred_uses_helpers = bd->uses_helpers_beginning; + for (unsigned i = 0; i < block->predecessors_count; i++) { + struct ir3_block *pred = block->predecessors[i]; + struct ir3_helper_block_data *pred_bd = pred->data; + if (pred_bd->uses_helpers_end) { + pred_uses_helpers = true; + break; + } + } + + if (!pred_uses_helpers) + continue; + + /* The last use of helpers is somewhere between the beginning and the + * end. first_instr will be the first instruction where helpers are no + * longer required, or NULL if helpers are not required just at the end. + */ + struct ir3_instruction *first_instr = NULL; + foreach_instr_rev (instr, &block->instr_list) { + /* Skip prefetches because they actually execute before the block + * starts and at this stage they aren't guaranteed to be at the start + * of the block. 
+ */ + if (uses_helpers(instr) && instr->opc != OPC_META_TEX_PREFETCH) + break; + first_instr = instr; + } + + bool killed = false; + bool expensive_instruction_in_block = false; + if (first_instr) { + foreach_instr_from (instr, first_instr, &block->instr_list) { + /* If there's already a nop, we don't have to worry about whether to + * insert one. + */ + if (instr->opc == OPC_NOP) { + instr->flags |= IR3_INSTR_EQ; + killed = true; + break; + } + + /* ALU and SFU instructions probably aren't going to benefit much + * from killing helper invocations, because they complete at least + * an entire quad in a cycle and don't access any quad-divergent + * memory, so delay emitting (eq) in the hopes that we find a nop + * afterwards. + */ + if (is_alu(instr) || is_sfu(instr)) + continue; + + expensive_instruction_in_block = true; + break; + } + } + + /* If this block isn't the last block before the end instruction, assume + * that there may be expensive instructions in later blocks so it's worth + * it to insert a nop. + */ + if (!killed && (expensive_instruction_in_block || + block->successors[0] != ir3_end_block(ir))) { + struct ir3_instruction *nop = ir3_NOP(block); + nop->flags |= IR3_INSTR_EQ; + if (first_instr) + ir3_instr_move_before(nop, first_instr); + } + } +} + bool ir3_legalize(struct ir3 *ir, struct ir3_shader_variant *so, int *max_bary) { @@ -976,6 +1185,11 @@ ir3_legalize(struct ir3 *ir, struct ir3_shader_variant *so, int *max_bary) while (opt_jump(ir)) ; + /* TODO: does (eq) exist before a6xx? */ + if (so->type == MESA_SHADER_FRAGMENT && so->need_pixlod && + so->compiler->gen >= 6) + helper_sched(ctx, ir, so); + ir3_count_instructions(ir); resolve_jumps(ir);