From 755227baa6d651b158fa560eb8238864bc0eef12 Mon Sep 17 00:00:00 2001
From: Alyssa Rosenzweig
Date: Sun, 7 Feb 2021 11:09:06 -0500
Subject: [PATCH] pan/mdg: Push uniforms based on UBO analysis
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Skips over "holes" in UBO ranges and allows pushing things other than
UBO #0 (GL uniforms) and sysvals.

shader-db results relative to beginning of series (so includes the hurt
from lowering UBO to uniforms):

total instructions in shared programs: 96611 -> 95018 (-1.65%)
instructions in affected programs: 22356 -> 20763 (-7.13%)
helped: 204
HURT: 13
helped stats (abs) min: 1 max: 27 x̄: 8.18 x̃: 7
helped stats (rel) min: 0.42% max: 26.09% x̄: 8.60% x̃: 8.07%
HURT stats (abs) min: 1 max: 33 x̄: 5.77 x̃: 2
HURT stats (rel) min: 0.47% max: 15.64% x̄: 3.56% x̃: 1.72%
95% mean confidence interval for instructions value: -8.29 -6.39
95% mean confidence interval for instructions %-change: -8.74% -7.00%
Instructions are helped.

total bundles in shared programs: 44886 -> 44790 (-0.21%)
bundles in affected programs: 9640 -> 9544 (-1.00%)
helped: 131
HURT: 70
helped stats (abs) min: 1 max: 11 x̄: 4.34 x̃: 4
helped stats (rel) min: 1.04% max: 42.31% x̄: 10.39% x̃: 9.84%
HURT stats (abs) min: 1 max: 16 x̄: 6.76 x̃: 6
HURT stats (rel) min: 2.22% max: 37.50% x̄: 13.78% x̃: 10.00%
95% mean confidence interval for bundles value: -1.37 0.42
95% mean confidence interval for bundles %-change: -3.99% 0.04%
Inconclusive result (value mean confidence interval includes 0).

total quadwords in shared programs: 76320 -> 75140 (-1.55%)
quadwords in affected programs: 16691 -> 15511 (-7.07%)
helped: 206
HURT: 5
helped stats (abs) min: 1 max: 18 x̄: 5.91 x̃: 6
helped stats (rel) min: 0.36% max: 27.78% x̄: 7.93% x̃: 8.33%
HURT stats (abs) min: 1 max: 19 x̄: 7.40 x̃: 1
HURT stats (rel) min: 0.55% max: 15.79% x̄: 7.39% x̃: 3.57%
95% mean confidence interval for quadwords value: -6.19 -5.00
95% mean confidence interval for quadwords %-change: -8.32% -6.82%
Quadwords are helped.

total registers in shared programs: 6958 -> 6827 (-1.88%)
registers in affected programs: 1083 -> 952 (-12.10%)
helped: 112
HURT: 16
helped stats (abs) min: 1 max: 3 x̄: 1.32 x̃: 1
helped stats (rel) min: 6.25% max: 50.00% x̄: 17.13% x̃: 12.50%
HURT stats (abs) min: 1 max: 2 x̄: 1.06 x̃: 1
HURT stats (rel) min: 9.09% max: 20.00% x̄: 11.97% x̃: 11.81%
95% mean confidence interval for registers value: -1.19 -0.86
95% mean confidence interval for registers %-change: -15.78% -11.21%
Registers are helped.

total threads in shared programs: 5109 -> 5153 (0.86%)
threads in affected programs: 62 -> 106 (70.97%)
helped: 42
HURT: 6
helped stats (abs) min: 1 max: 2 x̄: 1.19 x̃: 1
helped stats (rel) min: 100.00% max: 100.00% x̄: 100.00% x̃: 100.00%
HURT stats (abs) min: 1 max: 1 x̄: 1.00 x̃: 1
HURT stats (rel) min: 50.00% max: 50.00% x̄: 50.00% x̃: 50.00%
95% mean confidence interval for threads value: 0.68 1.16
95% mean confidence interval for threads %-change: 66.69% 95.81%
Threads are helped.

Signed-off-by: Alyssa Rosenzweig Reviewed-by: Boris Brezillon Part-of: --- src/panfrost/midgard/compiler.h | 1 + src/panfrost/midgard/midgard_compile.c | 2 + src/panfrost/midgard/midgard_ra.c | 6 +- src/panfrost/midgard/midgard_schedule.c | 2 +- src/panfrost/midgard/mir_promote_uniforms.c | 150 +++++++++++++++++--- src/panfrost/util/pan_ir.c | 19 +++ src/panfrost/util/pan_ir.h | 6 + 7 files changed, 163 insertions(+), 23 deletions(-) diff --git a/src/panfrost/midgard/compiler.h b/src/panfrost/midgard/compiler.h index 3532c7d7e96..bc2c687d58d 100644 --- a/src/panfrost/midgard/compiler.h +++ b/src/panfrost/midgard/compiler.h @@ -321,6 +321,7 @@ typedef struct compiler_context { midgard_instruction *writeout_branch[MIDGARD_NUM_RTS][MIDGARD_MAX_SAMPLE_ITER]; struct panfrost_sysvals sysvals; + struct panfrost_ubo_push *push; } compiler_context; /* Per-block live_in/live_out */ diff --git a/src/panfrost/midgard/midgard_compile.c b/src/panfrost/midgard/midgard_compile.c index fcfb145263f..aa42e4ca161 100644 --- a/src/panfrost/midgard/midgard_compile.c +++ b/src/panfrost/midgard/midgard_compile.c @@ -2986,6 +2986,8 @@ midgard_compile_shader_nir(void *mem_ctx, nir_shader *nir, ctx->stage = nir->info.stage; ctx->is_blend = inputs->is_blend; ctx->blend_rt = MIDGARD_COLOR_RT0 + inputs->blend.rt; + ctx->push = &program->push; + if (inputs->is_blend) { unsigned nr_samples = MAX2(inputs->blend.nr_samples, 1); const struct util_format_description *desc = diff --git a/src/panfrost/midgard/midgard_ra.c b/src/panfrost/midgard/midgard_ra.c index 72c06ade683..5e746d962ab 100644 --- a/src/panfrost/midgard/midgard_ra.c +++ b/src/panfrost/midgard/midgard_ra.c @@ -977,6 +977,8 @@ mir_demote_uniforms(compiler_context *ctx, unsigned new_cutoff) midgard_instruction *before = ins; unsigned temp = make_compiler_temp(ctx); + unsigned idx = (23 - SSA_REG_FROM_FIXED(ins->src[i])) * 4; + assert(idx < ctx->push->count); midgard_instruction ld = { .type = TAG_LOAD_STORE_4, @@ -987,12 +989,12 @@ 
mir_demote_uniforms(compiler_context *ctx, unsigned new_cutoff) .swizzle = SWIZZLE_IDENTITY_4, .op = midgard_op_ld_ubo_int4, .load_store = { + .arg_1 = ctx->push->words[idx].ubo, .arg_2 = 0x1E, }, + .constants.u32[0] = ctx->push->words[idx].offset }; - ld.constants.u32[0] = (23 - SSA_REG_FROM_FIXED(ins->src[i])) * 16; - mir_insert_instruction_before_scheduled(ctx, block, before, ld); mir_rewrite_index_src_single(ins, ins->src[i], temp); diff --git a/src/panfrost/midgard/midgard_schedule.c b/src/panfrost/midgard/midgard_schedule.c index 399aecb852e..973af220c2e 100644 --- a/src/panfrost/midgard/midgard_schedule.c +++ b/src/panfrost/midgard/midgard_schedule.c @@ -1447,7 +1447,7 @@ schedule_block(compiler_context *ctx, midgard_block *block) void midgard_schedule_program(compiler_context *ctx) { -// midgard_promote_uniforms(ctx); + midgard_promote_uniforms(ctx); /* Must be lowered right before scheduling */ mir_squeeze_index(ctx); diff --git a/src/panfrost/midgard/mir_promote_uniforms.c b/src/panfrost/midgard/mir_promote_uniforms.c index 01fb9d996d9..b5e063e0600 100644 --- a/src/panfrost/midgard/mir_promote_uniforms.c +++ b/src/panfrost/midgard/mir_promote_uniforms.c @@ -37,26 +37,124 @@ */ static bool -mir_is_promoteable_ubo(midgard_instruction *ins) +mir_is_direct_aligned_ubo(midgard_instruction *ins) { - /* TODO: promote unaligned access via swizzle? 
*/ - return (ins->type == TAG_LOAD_STORE_4) && (OP_IS_UBO_READ(ins->op)) && !(ins->constants.u32[0] & 0xF) && - !(ins->load_store.arg_1) && - (ins->load_store.arg_2 == 0x1E) && - ((ins->constants.u32[0] / 16) < 16); + (ins->src[1] == ~0) && + (ins->src[2] == ~0); } +/* Represents use data for a single UBO */ + +#define MAX_UBO_QWORDS (65536 / 16) + +struct mir_ubo_block { + BITSET_DECLARE(uses, MAX_UBO_QWORDS); + BITSET_DECLARE(pushed, MAX_UBO_QWORDS); +}; + +struct mir_ubo_analysis { + /* Per block analysis */ + unsigned nr_blocks; + struct mir_ubo_block *blocks; +}; + +static struct mir_ubo_analysis +mir_analyze_ranges(compiler_context *ctx) +{ + struct mir_ubo_analysis res = { + .nr_blocks = ctx->nir->info.num_ubos + 1, + }; + + res.blocks = calloc(res.nr_blocks, sizeof(struct mir_ubo_block)); + + mir_foreach_instr_global(ctx, ins) { + if (!mir_is_direct_aligned_ubo(ins)) continue; + + unsigned ubo = ins->load_store.arg_1; + unsigned offset = ins->constants.u32[0] / 16; + + assert(ubo < res.nr_blocks); + + if (offset < MAX_UBO_QWORDS) + BITSET_SET(res.blocks[ubo].uses, offset); + } + + return res; +} + +/* Select UBO words to push. A sophisticated implementation would consider the + * number of uses and perhaps the control flow to estimate benefit. This is not + * sophisticated. Select from the last UBO first to prioritize sysvals. 
*/ + +static void +mir_pick_ubo(struct panfrost_ubo_push *push, struct mir_ubo_analysis *analysis, unsigned max_qwords) +{ + unsigned max_words = MIN2(PAN_MAX_PUSH, max_qwords * 4); + + for (signed ubo = analysis->nr_blocks - 1; ubo >= 0; --ubo) { + struct mir_ubo_block *block = &analysis->blocks[ubo]; + + unsigned vec4; + BITSET_FOREACH_SET(vec4, block->uses, MAX_UBO_QWORDS) { + /* Don't push more than possible */ + if (push->count > max_words - 4) + return; + + for (unsigned offs = 0; offs < 4; ++offs) { + struct panfrost_ubo_word word = { + .ubo = ubo, + .offset = (vec4 * 16) + (offs * 4) + }; + + push->words[push->count++] = word; + } + + /* Mark it as pushed so we can rewrite */ + BITSET_SET(block->pushed, vec4); + } + } +} + +#if 0 +static void +mir_dump_ubo_analysis(struct mir_ubo_analysis *res) +{ + printf("%u blocks\n", res->nr_blocks); + + for (unsigned i = 0; i < res->nr_blocks; ++i) { + BITSET_WORD *uses = res->blocks[i].uses; + BITSET_WORD *push = res->blocks[i].pushed; + + unsigned last = BITSET_LAST_BIT(uses, BITSET_WORDS(MAX_UBO_QWORDS)); + + printf("\t"); + + for (unsigned j = 0; j < last; ++j) { + bool used = BITSET_TEST(uses, j); + bool pushed = BITSET_TEST(push, j); + assert(used || !pushed); + + putchar(pushed ? '*' : used ? 
'-' : '_'); + } + + printf("\n"); + } +} +#endif + static unsigned -mir_promoteable_uniform_count(compiler_context *ctx) +mir_promoteable_uniform_count(struct mir_ubo_analysis *analysis) { unsigned count = 0; - mir_foreach_instr_global(ctx, ins) { - if (mir_is_promoteable_ubo(ins)) - count = MAX2(count, ins->constants.u32[0] / 16); + for (unsigned i = 0; i < analysis->nr_blocks; ++i) { + BITSET_WORD *uses = analysis->blocks[i].uses; + + for (unsigned w = 0; w < BITSET_WORDS(MAX_UBO_QWORDS); ++w) + count += util_bitcount(uses[w]); } return count; @@ -98,9 +196,9 @@ mir_estimate_pressure(compiler_context *ctx) } static unsigned -mir_work_heuristic(compiler_context *ctx) +mir_work_heuristic(compiler_context *ctx, struct mir_ubo_analysis *analysis) { - unsigned uniform_count = mir_promoteable_uniform_count(ctx); + unsigned uniform_count = mir_promoteable_uniform_count(analysis); /* If there are 8 or fewer uniforms, it doesn't matter what we do, so * allow as many work registers as needed */ @@ -160,24 +258,35 @@ mir_special_indices(compiler_context *ctx) void midgard_promote_uniforms(compiler_context *ctx) { - unsigned work_count = mir_work_heuristic(ctx); + struct mir_ubo_analysis analysis = mir_analyze_ranges(ctx); + + unsigned work_count = mir_work_heuristic(ctx, &analysis); unsigned promoted_count = 24 - work_count; + mir_pick_ubo(ctx->push, &analysis, promoted_count); + /* First, figure out special indices a priori so we don't recompute a lot */ BITSET_WORD *special = mir_special_indices(ctx); mir_foreach_instr_global_safe(ctx, ins) { - if (!mir_is_promoteable_ubo(ins)) continue; + if (!mir_is_direct_aligned_ubo(ins)) continue; - unsigned off = ins->constants.u32[0]; - unsigned address = off / 16; + unsigned ubo = ins->load_store.arg_1; + unsigned qword = ins->constants.u32[0] / 16; - /* Check if it's a promotable range */ + /* Check if we decided to push this */ + assert(ubo < analysis.nr_blocks); + if (!BITSET_TEST(analysis.blocks[ubo].pushed, qword)) continue; 
+ + /* Find where we pushed to, TODO: unaligned pushes to pack */ + unsigned base = pan_lookup_pushed_ubo(ctx->push, ubo, qword * 16); + assert((base & 0x3) == 0); + + unsigned address = base / 4; unsigned uniform_reg = 23 - address; - if (address >= promoted_count) continue; - - /* It is, great! Let's promote */ + /* Should've taken into account when pushing */ + assert(address < promoted_count); ctx->uniform_cutoff = MAX2(ctx->uniform_cutoff, address + 1); unsigned promoted = SSA_FIXED_REGISTER(uniform_reg); @@ -207,4 +316,5 @@ midgard_promote_uniforms(compiler_context *ctx) } free(special); + free(analysis.blocks); } diff --git a/src/panfrost/util/pan_ir.c b/src/panfrost/util/pan_ir.c index a58fa631843..c469274933f 100644 --- a/src/panfrost/util/pan_ir.c +++ b/src/panfrost/util/pan_ir.c @@ -129,3 +129,22 @@ pan_print_alu_type(nir_alu_type t, FILE *fp) fprintf(fp, "%u", size); } + +/* Could optimize with a better data structure if anyone cares, TODO: profile */ + +unsigned +pan_lookup_pushed_ubo(struct panfrost_ubo_push *push, unsigned ubo, unsigned offs) +{ + struct panfrost_ubo_word word = { + .ubo = ubo, + .offset = offs + }; + + for (unsigned i = 0; i < push->count; ++i) { + if (memcmp(push->words + i, &word, sizeof(word)) == 0) + return i; + } + + unreachable("UBO not pushed"); + +} diff --git a/src/panfrost/util/pan_ir.h b/src/panfrost/util/pan_ir.h index 02af7174911..9622a4bc890 100644 --- a/src/panfrost/util/pan_ir.h +++ b/src/panfrost/util/pan_ir.h @@ -97,6 +97,12 @@ struct panfrost_ubo_push { struct panfrost_ubo_word words[PAN_MAX_PUSH]; }; +/* Helper for searching the above. Note this is O(N) to the number of pushed + * constants, do not run in the draw call hot path */ + +unsigned +pan_lookup_pushed_ubo(struct panfrost_ubo_push *push, unsigned ubo, unsigned offs); + void panfrost_nir_assign_sysvals(struct panfrost_sysvals *ctx, void *memctx, nir_shader *shader);