diff --git a/src/panfrost/midgard/compiler.h b/src/panfrost/midgard/compiler.h
index 3532c7d7e96..bc2c687d58d 100644
--- a/src/panfrost/midgard/compiler.h
+++ b/src/panfrost/midgard/compiler.h
@@ -321,6 +321,7 @@ typedef struct compiler_context {
         midgard_instruction *writeout_branch[MIDGARD_NUM_RTS][MIDGARD_MAX_SAMPLE_ITER];
 
         struct panfrost_sysvals sysvals;
+        struct panfrost_ubo_push *push;
 } compiler_context;
 
 /* Per-block live_in/live_out */
diff --git a/src/panfrost/midgard/midgard_compile.c b/src/panfrost/midgard/midgard_compile.c
index fcfb145263f..aa42e4ca161 100644
--- a/src/panfrost/midgard/midgard_compile.c
+++ b/src/panfrost/midgard/midgard_compile.c
@@ -2986,6 +2986,8 @@ midgard_compile_shader_nir(void *mem_ctx, nir_shader *nir,
         ctx->stage = nir->info.stage;
         ctx->is_blend = inputs->is_blend;
         ctx->blend_rt = MIDGARD_COLOR_RT0 + inputs->blend.rt;
+        ctx->push = &program->push;
+
         if (inputs->is_blend) {
                 unsigned nr_samples = MAX2(inputs->blend.nr_samples, 1);
                 const struct util_format_description *desc =
diff --git a/src/panfrost/midgard/midgard_ra.c b/src/panfrost/midgard/midgard_ra.c
index 72c06ade683..5e746d962ab 100644
--- a/src/panfrost/midgard/midgard_ra.c
+++ b/src/panfrost/midgard/midgard_ra.c
@@ -977,6 +977,8 @@ mir_demote_uniforms(compiler_context *ctx, unsigned new_cutoff)
                         midgard_instruction *before = ins;
 
                         unsigned temp = make_compiler_temp(ctx);
+                        unsigned idx = (23 - SSA_REG_FROM_FIXED(ins->src[i])) * 4;
+                        assert(idx < ctx->push->count);
 
                         midgard_instruction ld = {
                                 .type = TAG_LOAD_STORE_4,
@@ -987,12 +989,12 @@ mir_demote_uniforms(compiler_context *ctx, unsigned new_cutoff)
                                 .swizzle = SWIZZLE_IDENTITY_4,
                                 .op = midgard_op_ld_ubo_int4,
                                 .load_store = {
+                                        .arg_1 = ctx->push->words[idx].ubo,
                                         .arg_2 = 0x1E,
                                 },
+                                .constants.u32[0] = ctx->push->words[idx].offset
                         };
 
-                        ld.constants.u32[0] = (23 - SSA_REG_FROM_FIXED(ins->src[i])) * 16;
-
                         mir_insert_instruction_before_scheduled(ctx, block, before, ld);
 
                         mir_rewrite_index_src_single(ins, ins->src[i], temp);
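
The index arithmetic in mir_demote_uniforms inverts the promotion mapping: Midgard shares registers r0-r23 between work and uniform values, pushed vec4s are allocated downward from r23, and each vec4 occupies four consecutive 32-bit entries in the push buffer, so a uniform register r corresponds to push words starting at (23 - r) * 4. A minimal standalone sketch of that round trip; the helper names and constants below are illustrative, not driver API:

    #include <assert.h>
    #include <stdio.h>

    /* Midgard exposes r0..r23 to work/uniform allocation; pushed uniforms
     * grow downward from r23, one vec4 (four 32-bit words) per register. */
    #define LAST_UNIFORM_REG 23
    #define WORDS_PER_VEC4 4

    /* Register holding the vec4 whose first push word is `base` */
    static unsigned promoted_register(unsigned base)
    {
            assert((base & 0x3) == 0);
            return LAST_UNIFORM_REG - (base / WORDS_PER_VEC4);
    }

    /* Inverse, as used when demoting: first push-word index for a register */
    static unsigned push_word_index(unsigned reg)
    {
            return (LAST_UNIFORM_REG - reg) * WORDS_PER_VEC4;
    }

    int main(void)
    {
            for (unsigned base = 0; base < 64; base += WORDS_PER_VEC4) {
                    unsigned reg = promoted_register(base);
                    assert(push_word_index(reg) == base);
                    printf("words %u..%u -> r%u\n", base, base + 3, reg);
            }

            return 0;
    }

This is why the demotion path can recover the (ubo, offset) pair for a spilled uniform with a single table index: once picking has run, the push buffer is the source of truth.
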
diff --git a/src/panfrost/midgard/midgard_schedule.c b/src/panfrost/midgard/midgard_schedule.c
index 399aecb852e..973af220c2e 100644
--- a/src/panfrost/midgard/midgard_schedule.c
+++ b/src/panfrost/midgard/midgard_schedule.c
@@ -1447,7 +1447,7 @@ schedule_block(compiler_context *ctx, midgard_block *block)
 void
 midgard_schedule_program(compiler_context *ctx)
 {
-// midgard_promote_uniforms(ctx);
+        midgard_promote_uniforms(ctx);
 
         /* Must be lowered right before scheduling */
         mir_squeeze_index(ctx);
diff --git a/src/panfrost/midgard/mir_promote_uniforms.c b/src/panfrost/midgard/mir_promote_uniforms.c
index 01fb9d996d9..b5e063e0600 100644
--- a/src/panfrost/midgard/mir_promote_uniforms.c
+++ b/src/panfrost/midgard/mir_promote_uniforms.c
@@ -37,26 +37,124 @@
  */
 
 static bool
-mir_is_promoteable_ubo(midgard_instruction *ins)
+mir_is_direct_aligned_ubo(midgard_instruction *ins)
 {
-        /* TODO: promote unaligned access via swizzle? */
-
         return (ins->type == TAG_LOAD_STORE_4) &&
                 (OP_IS_UBO_READ(ins->op)) &&
                 !(ins->constants.u32[0] & 0xF) &&
-                !(ins->load_store.arg_1) &&
-                (ins->load_store.arg_2 == 0x1E) &&
-                ((ins->constants.u32[0] / 16) < 16);
+                (ins->src[1] == ~0) &&
+                (ins->src[2] == ~0);
 }
 
+/* Represents use data for a single UBO */
+
+#define MAX_UBO_QWORDS (65536 / 16)
+
+struct mir_ubo_block {
+        BITSET_DECLARE(uses, MAX_UBO_QWORDS);
+        BITSET_DECLARE(pushed, MAX_UBO_QWORDS);
+};
+
+struct mir_ubo_analysis {
+        /* Per block analysis */
+        unsigned nr_blocks;
+        struct mir_ubo_block *blocks;
+};
+
+static struct mir_ubo_analysis
+mir_analyze_ranges(compiler_context *ctx)
+{
+        struct mir_ubo_analysis res = {
+                .nr_blocks = ctx->nir->info.num_ubos + 1,
+        };
+
+        res.blocks = calloc(res.nr_blocks, sizeof(struct mir_ubo_block));
+
+        mir_foreach_instr_global(ctx, ins) {
+                if (!mir_is_direct_aligned_ubo(ins)) continue;
+
+                unsigned ubo = ins->load_store.arg_1;
+                unsigned offset = ins->constants.u32[0] / 16;
+
+                assert(ubo < res.nr_blocks);
+
+                if (offset < MAX_UBO_QWORDS)
+                        BITSET_SET(res.blocks[ubo].uses, offset);
+        }
+
+        return res;
+}
+
+/* Select UBO words to push. A sophisticated implementation would consider the
+ * number of uses and perhaps the control flow to estimate benefit. This is
+ * not sophisticated. Select from the last UBO first to prioritize sysvals. */
+
+static void
+mir_pick_ubo(struct panfrost_ubo_push *push, struct mir_ubo_analysis *analysis, unsigned max_qwords)
+{
+        unsigned max_words = MIN2(PAN_MAX_PUSH, max_qwords * 4);
+
+        for (signed ubo = analysis->nr_blocks - 1; ubo >= 0; --ubo) {
+                struct mir_ubo_block *block = &analysis->blocks[ubo];
+
+                unsigned vec4;
+                BITSET_FOREACH_SET(vec4, block->uses, MAX_UBO_QWORDS) {
+                        /* Don't push more than possible */
+                        if (push->count > max_words - 4)
+                                return;
+
+                        for (unsigned offs = 0; offs < 4; ++offs) {
+                                struct panfrost_ubo_word word = {
+                                        .ubo = ubo,
+                                        .offset = (vec4 * 16) + (offs * 4)
+                                };
+
+                                push->words[push->count++] = word;
+                        }
+
+                        /* Mark it as pushed so we can rewrite */
+                        BITSET_SET(block->pushed, vec4);
+                }
+        }
+}
+
+#if 0
+static void
+mir_dump_ubo_analysis(struct mir_ubo_analysis *res)
+{
+        printf("%u blocks\n", res->nr_blocks);
+
+        for (unsigned i = 0; i < res->nr_blocks; ++i) {
+                BITSET_WORD *uses = res->blocks[i].uses;
+                BITSET_WORD *push = res->blocks[i].pushed;
+
+                unsigned last = BITSET_LAST_BIT(uses, BITSET_WORDS(MAX_UBO_QWORDS));
+
+                printf("\t");
+
+                for (unsigned j = 0; j < last; ++j) {
+                        bool used = BITSET_TEST(uses, j);
+                        bool pushed = BITSET_TEST(push, j);
+                        assert(used || !pushed);
+
+                        putchar(pushed ? '*' : used ? '-' : '_');
+                }
+
+                printf("\n");
+        }
+}
+#endif
+
 static unsigned
-mir_promoteable_uniform_count(compiler_context *ctx)
+mir_promoteable_uniform_count(struct mir_ubo_analysis *analysis)
 {
         unsigned count = 0;
 
-        mir_foreach_instr_global(ctx, ins) {
-                if (mir_is_promoteable_ubo(ins))
-                        count = MAX2(count, ins->constants.u32[0] / 16);
+        for (unsigned i = 0; i < analysis->nr_blocks; ++i) {
+                BITSET_WORD *uses = analysis->blocks[i].uses;
+
+                for (unsigned w = 0; w < BITSET_WORDS(MAX_UBO_QWORDS); ++w)
+                        count += util_bitcount(uses[w]);
         }
 
         return count;
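
mir_pick_ubo above is a greedy selector over the per-UBO use bitsets: it walks UBOs from the last one down (sysvals live in the final UBO) and, for each used vec4, records four push words unless that would blow the budget. A simplified stand-in for the same loop, assuming at most 32 vec4s per UBO so a plain uint32_t can replace Mesa's BITSET machinery; nothing below is driver API:

    #include <stdint.h>
    #include <stdio.h>

    /* Mirrors panfrost_ubo_word in shape only */
    struct word { unsigned ubo, offset; };

    static unsigned
    pick(struct word *out, unsigned max_words,
         const uint32_t *uses, uint32_t *pushed, int nr_ubos)
    {
            unsigned count = 0;

            /* Last UBO first, so sysvals win when the budget is tight */
            for (int ubo = nr_ubos - 1; ubo >= 0; --ubo) {
                    for (unsigned vec4 = 0; vec4 < 32; ++vec4) {
                            if (!(uses[ubo] & (1u << vec4)))
                                    continue;

                            if (count + 4 > max_words)
                                    return count;

                            /* One record per 32-bit word of the vec4 */
                            for (unsigned w = 0; w < 4; ++w) {
                                    out[count++] = (struct word) {
                                            .ubo = ubo,
                                            .offset = (vec4 * 16) + (w * 4)
                                    };
                            }

                            pushed[ubo] |= 1u << vec4; /* for the rewrite pass */
                    }
            }

            return count;
    }

    int main(void)
    {
            uint32_t uses[2] = { 0x5, 0x1 }; /* UBO 0: vec4s 0 and 2; UBO 1: vec4 0 */
            uint32_t pushed[2] = { 0, 0 };
            struct word out[32];

            unsigned n = pick(out, 32, uses, pushed, 2);
            printf("%u words; first = (ubo %u, offset %u)\n", n, out[0].ubo, out[0].offset);

            return 0;
    }

The sketch writes its guard as count + 4 > max_words to sidestep the unsigned wrap that max_words - 4 produces for budgets under four words; the pass itself never sees such a budget, since promoted_count = 24 - work_count stays well above 4.
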
@@ -98,9 +196,9 @@ mir_estimate_pressure(compiler_context *ctx)
 }
 
 static unsigned
-mir_work_heuristic(compiler_context *ctx)
+mir_work_heuristic(compiler_context *ctx, struct mir_ubo_analysis *analysis)
 {
-        unsigned uniform_count = mir_promoteable_uniform_count(ctx);
+        unsigned uniform_count = mir_promoteable_uniform_count(analysis);
 
         /* If there are 8 or fewer uniforms, it doesn't matter what we do, so
          * allow as many work registers as needed */
@@ -160,24 +258,35 @@ mir_special_indices(compiler_context *ctx)
 void
 midgard_promote_uniforms(compiler_context *ctx)
 {
-        unsigned work_count = mir_work_heuristic(ctx);
+        struct mir_ubo_analysis analysis = mir_analyze_ranges(ctx);
+
+        unsigned work_count = mir_work_heuristic(ctx, &analysis);
         unsigned promoted_count = 24 - work_count;
 
+        mir_pick_ubo(ctx->push, &analysis, promoted_count);
+
         /* First, figure out special indices a priori so we don't recompute a lot */
         BITSET_WORD *special = mir_special_indices(ctx);
 
         mir_foreach_instr_global_safe(ctx, ins) {
-                if (!mir_is_promoteable_ubo(ins)) continue;
+                if (!mir_is_direct_aligned_ubo(ins)) continue;
 
-                unsigned off = ins->constants.u32[0];
-                unsigned address = off / 16;
+                unsigned ubo = ins->load_store.arg_1;
+                unsigned qword = ins->constants.u32[0] / 16;
 
-                /* Check if it's a promotable range */
+                /* Check if we decided to push this */
+                assert(ubo < analysis.nr_blocks);
+                if (!BITSET_TEST(analysis.blocks[ubo].pushed, qword)) continue;
+
+                /* Find where we pushed to, TODO: unaligned pushes to pack */
+                unsigned base = pan_lookup_pushed_ubo(ctx->push, ubo, qword * 16);
+                assert((base & 0x3) == 0);
+
+                unsigned address = base / 4;
                 unsigned uniform_reg = 23 - address;
 
-                if (address >= promoted_count) continue;
-
-                /* It is, great! Let's promote */
+                /* Should have been taken into account when pushing */
+                assert(address < promoted_count);
 
                 ctx->uniform_cutoff = MAX2(ctx->uniform_cutoff, address + 1);
                 unsigned promoted = SSA_FIXED_REGISTER(uniform_reg);
@@ -207,4 +316,5 @@ midgard_promote_uniforms(compiler_context *ctx)
         }
 
         free(special);
+        free(analysis.blocks);
 }
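
End to end, a pushed load is rewritten by looking up where its words landed and turning the word index back into a register. A worked example with a hypothetical push layout: one sysval vec4 from UBO 2 (selected first, since picking runs from the last UBO down), then vec4s 0 and 2 of UBO 1. The structs mirror panfrost_ubo_push in shape only:

    #include <assert.h>
    #include <stdio.h>

    struct word { unsigned ubo, offset; };

    /* Hypothetical post-pick layout */
    static const struct word words[] = {
            { 2,  0 }, { 2,  4 }, { 2,  8 }, { 2, 12 }, /* sysval vec4   -> r23 */
            { 1,  0 }, { 1,  4 }, { 1,  8 }, { 1, 12 }, /* UBO 1, vec4 0 -> r22 */
            { 1, 32 }, { 1, 36 }, { 1, 40 }, { 1, 44 }, /* UBO 1, vec4 2 -> r21 */
    };

    /* Same linear search as pan_lookup_pushed_ubo, returning a word index */
    static unsigned lookup(unsigned ubo, unsigned offset)
    {
            for (unsigned i = 0; i < sizeof(words) / sizeof(words[0]); ++i) {
                    if (words[i].ubo == ubo && words[i].offset == offset)
                            return i;
            }

            assert(!"word not pushed");
            return ~0u;
    }

    int main(void)
    {
            /* A load of UBO 1 at byte offset 32 (qword 2) starts at word 8,
             * so it promotes to uniform register 23 - 8/4 = r21 */
            unsigned base = lookup(1, 2 * 16);
            assert((base & 0x3) == 0);
            printf("promoted to r%u\n", 23 - (base / 4));

            return 0;
    }
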
diff --git a/src/panfrost/util/pan_ir.c b/src/panfrost/util/pan_ir.c
index a58fa631843..c469274933f 100644
--- a/src/panfrost/util/pan_ir.c
+++ b/src/panfrost/util/pan_ir.c
@@ -129,3 +129,21 @@ pan_print_alu_type(nir_alu_type t, FILE *fp)
 
         fprintf(fp, "%u", size);
 }
+
+/* Could optimize with a better data structure if anyone cares. TODO: profile */
+
+unsigned
+pan_lookup_pushed_ubo(struct panfrost_ubo_push *push, unsigned ubo, unsigned offs)
+{
+        struct panfrost_ubo_word word = {
+                .ubo = ubo,
+                .offset = offs
+        };
+
+        for (unsigned i = 0; i < push->count; ++i) {
+                if (memcmp(push->words + i, &word, sizeof(word)) == 0)
+                        return i;
+        }
+
+        unreachable("UBO not pushed");
+}
diff --git a/src/panfrost/util/pan_ir.h b/src/panfrost/util/pan_ir.h
index 02af7174911..9622a4bc890 100644
--- a/src/panfrost/util/pan_ir.h
+++ b/src/panfrost/util/pan_ir.h
@@ -97,6 +97,12 @@ struct panfrost_ubo_push {
         struct panfrost_ubo_word words[PAN_MAX_PUSH];
 };
 
+/* Helper for searching the above. Note this is O(N) to the number of pushed
+ * constants, so do not call it in the draw-call hot path */
+
+unsigned
+pan_lookup_pushed_ubo(struct panfrost_ubo_push *push, unsigned ubo, unsigned offs);
+
 void
 panfrost_nir_assign_sysvals(struct panfrost_sysvals *ctx, void *memctx, nir_shader *shader);
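
On the consumer side, a driver walks panfrost_ubo_push at state-emit time, gathering one 32-bit word per entry from CPU copies of the UBOs into the buffer the hardware reads as uniform registers. A sketch under assumptions: the field names follow pan_ir.h, but the exact types, the cap value, and the gather function itself are illustrative rather than the driver's actual code:

    #include <stdint.h>
    #include <string.h>

    #define PAN_MAX_PUSH 128 /* assumed cap; use the real value from pan_ir.h */

    struct panfrost_ubo_word { unsigned ubo; unsigned offset; };

    struct panfrost_ubo_push {
            unsigned count;
            struct panfrost_ubo_word words[PAN_MAX_PUSH];
    };

    /* ubo_cpu[i] points at the CPU-visible contents of UBO i */
    static void
    gather_push_constants(const struct panfrost_ubo_push *push,
                          const uint8_t *const *ubo_cpu, uint32_t *out)
    {
            for (unsigned i = 0; i < push->count; ++i) {
                    const struct panfrost_ubo_word *w = &push->words[i];
                    memcpy(&out[i], ubo_cpu[w->ubo] + w->offset, sizeof(uint32_t));
            }
    }

Note the division of labor the header comment asks for: the linear pan_lookup_pushed_ubo runs once per promoted load at compile time, while the per-draw gather only iterates the pushed words in order.
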