mirror of
https://gitlab.freedesktop.org/mesa/mesa.git
synced 2025-12-30 16:30:10 +01:00
pan/mdg: Push uniforms based on UBO analysis
Skips over "holes" in UBO ranges and allows pushing things other than UBO #0 (GL uniforms) and sysvals.

shader-db results relative to beginning of series (so includes the hurt from lowering UBO to uniforms):

total instructions in shared programs: 96611 -> 95018 (-1.65%)
instructions in affected programs: 22356 -> 20763 (-7.13%)
helped: 204
HURT: 13
helped stats (abs) min: 1 max: 27 x̄: 8.18 x̃: 7
helped stats (rel) min: 0.42% max: 26.09% x̄: 8.60% x̃: 8.07%
HURT stats (abs) min: 1 max: 33 x̄: 5.77 x̃: 2
HURT stats (rel) min: 0.47% max: 15.64% x̄: 3.56% x̃: 1.72%
95% mean confidence interval for instructions value: -8.29 -6.39
95% mean confidence interval for instructions %-change: -8.74% -7.00%
Instructions are helped.

total bundles in shared programs: 44886 -> 44790 (-0.21%)
bundles in affected programs: 9640 -> 9544 (-1.00%)
helped: 131
HURT: 70
helped stats (abs) min: 1 max: 11 x̄: 4.34 x̃: 4
helped stats (rel) min: 1.04% max: 42.31% x̄: 10.39% x̃: 9.84%
HURT stats (abs) min: 1 max: 16 x̄: 6.76 x̃: 6
HURT stats (rel) min: 2.22% max: 37.50% x̄: 13.78% x̃: 10.00%
95% mean confidence interval for bundles value: -1.37 0.42
95% mean confidence interval for bundles %-change: -3.99% 0.04%
Inconclusive result (value mean confidence interval includes 0).

total quadwords in shared programs: 76320 -> 75140 (-1.55%)
quadwords in affected programs: 16691 -> 15511 (-7.07%)
helped: 206
HURT: 5
helped stats (abs) min: 1 max: 18 x̄: 5.91 x̃: 6
helped stats (rel) min: 0.36% max: 27.78% x̄: 7.93% x̃: 8.33%
HURT stats (abs) min: 1 max: 19 x̄: 7.40 x̃: 1
HURT stats (rel) min: 0.55% max: 15.79% x̄: 7.39% x̃: 3.57%
95% mean confidence interval for quadwords value: -6.19 -5.00
95% mean confidence interval for quadwords %-change: -8.32% -6.82%
Quadwords are helped.

total registers in shared programs: 6958 -> 6827 (-1.88%)
registers in affected programs: 1083 -> 952 (-12.10%)
helped: 112
HURT: 16
helped stats (abs) min: 1 max: 3 x̄: 1.32 x̃: 1
helped stats (rel) min: 6.25% max: 50.00% x̄: 17.13% x̃: 12.50%
HURT stats (abs) min: 1 max: 2 x̄: 1.06 x̃: 1
HURT stats (rel) min: 9.09% max: 20.00% x̄: 11.97% x̃: 11.81%
95% mean confidence interval for registers value: -1.19 -0.86
95% mean confidence interval for registers %-change: -15.78% -11.21%
Registers are helped.

total threads in shared programs: 5109 -> 5153 (0.86%)
threads in affected programs: 62 -> 106 (70.97%)
helped: 42
HURT: 6
helped stats (abs) min: 1 max: 2 x̄: 1.19 x̃: 1
helped stats (rel) min: 100.00% max: 100.00% x̄: 100.00% x̃: 100.00%
HURT stats (abs) min: 1 max: 1 x̄: 1.00 x̃: 1
HURT stats (rel) min: 50.00% max: 50.00% x̄: 50.00% x̃: 50.00%
95% mean confidence interval for threads value: 0.68 1.16
95% mean confidence interval for threads %-change: 66.69% 95.81%
Threads are helped.

Signed-off-by: Alyssa Rosenzweig <alyssa.rosenzweig@collabora.com>
Reviewed-by: Boris Brezillon <boris.brezillon@collabora.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/8973>
This commit is contained in:
parent
4c65067150
commit
755227baa6
7 changed files with 163 additions and 23 deletions
|
|
@ -321,6 +321,7 @@ typedef struct compiler_context {
|
|||
midgard_instruction *writeout_branch[MIDGARD_NUM_RTS][MIDGARD_MAX_SAMPLE_ITER];
|
||||
|
||||
struct panfrost_sysvals sysvals;
|
||||
struct panfrost_ubo_push *push;
|
||||
} compiler_context;
|
||||
|
||||
/* Per-block live_in/live_out */
|
||||
|
|
|
|||
|
|
@ -2986,6 +2986,8 @@ midgard_compile_shader_nir(void *mem_ctx, nir_shader *nir,
|
|||
ctx->stage = nir->info.stage;
|
||||
ctx->is_blend = inputs->is_blend;
|
||||
ctx->blend_rt = MIDGARD_COLOR_RT0 + inputs->blend.rt;
|
||||
ctx->push = &program->push;
|
||||
|
||||
if (inputs->is_blend) {
|
||||
unsigned nr_samples = MAX2(inputs->blend.nr_samples, 1);
|
||||
const struct util_format_description *desc =
|
||||
|
|
|
|||
|
|
@ -977,6 +977,8 @@ mir_demote_uniforms(compiler_context *ctx, unsigned new_cutoff)
|
|||
midgard_instruction *before = ins;
|
||||
|
||||
unsigned temp = make_compiler_temp(ctx);
|
||||
unsigned idx = (23 - SSA_REG_FROM_FIXED(ins->src[i])) * 4;
|
||||
assert(idx < ctx->push->count);
|
||||
|
||||
midgard_instruction ld = {
|
||||
.type = TAG_LOAD_STORE_4,
|
||||
|
|
@ -987,12 +989,12 @@ mir_demote_uniforms(compiler_context *ctx, unsigned new_cutoff)
|
|||
.swizzle = SWIZZLE_IDENTITY_4,
|
||||
.op = midgard_op_ld_ubo_int4,
|
||||
.load_store = {
|
||||
.arg_1 = ctx->push->words[idx].ubo,
|
||||
.arg_2 = 0x1E,
|
||||
},
|
||||
.constants.u32[0] = ctx->push->words[idx].offset
|
||||
};
|
||||
|
||||
ld.constants.u32[0] = (23 - SSA_REG_FROM_FIXED(ins->src[i])) * 16;
|
||||
|
||||
mir_insert_instruction_before_scheduled(ctx, block, before, ld);
|
||||
|
||||
mir_rewrite_index_src_single(ins, ins->src[i], temp);
|
||||
|
|
|
|||
|
|
@ -1447,7 +1447,7 @@ schedule_block(compiler_context *ctx, midgard_block *block)
|
|||
void
|
||||
midgard_schedule_program(compiler_context *ctx)
|
||||
{
|
||||
// midgard_promote_uniforms(ctx);
|
||||
midgard_promote_uniforms(ctx);
|
||||
|
||||
/* Must be lowered right before scheduling */
|
||||
mir_squeeze_index(ctx);
|
||||
|
|
|
|||
|
|
@ -37,26 +37,124 @@
|
|||
*/
|
||||
|
||||
static bool
|
||||
mir_is_promoteable_ubo(midgard_instruction *ins)
|
||||
mir_is_direct_aligned_ubo(midgard_instruction *ins)
|
||||
{
|
||||
/* TODO: promote unaligned access via swizzle? */
|
||||
|
||||
return (ins->type == TAG_LOAD_STORE_4) &&
|
||||
(OP_IS_UBO_READ(ins->op)) &&
|
||||
!(ins->constants.u32[0] & 0xF) &&
|
||||
!(ins->load_store.arg_1) &&
|
||||
(ins->load_store.arg_2 == 0x1E) &&
|
||||
((ins->constants.u32[0] / 16) < 16);
|
||||
(ins->src[1] == ~0) &&
|
||||
(ins->src[2] == ~0);
|
||||
}
|
||||
|
||||
/* Represents use data for a single UBO */
|
||||
|
||||
#define MAX_UBO_QWORDS (65536 / 16)
|
||||
|
||||
/* Per-UBO bitsets with one bit per 16-byte slot (bit index = byte offset / 16),
 * MAX_UBO_QWORDS bits each. */
struct mir_ubo_block {
|
||||
/* Slots read by a direct, aligned UBO load; filled in by mir_analyze_ranges */
BITSET_DECLARE(uses, MAX_UBO_QWORDS);
|
||||
/* Slots that mir_pick_ubo selected for pushing */
BITSET_DECLARE(pushed, MAX_UBO_QWORDS);
|
||||
};
|
||||
|
||||
/* Result of mir_analyze_ranges: one mir_ubo_block per UBO index. */
struct mir_ubo_analysis {
|
||||
/* Per block analysis: `blocks` is a heap array of `nr_blocks` entries
 * indexed by UBO number; it is allocated with calloc by
 * mir_analyze_ranges and must be released with free() by the caller */
|
||||
unsigned nr_blocks;
|
||||
struct mir_ubo_block *blocks;
|
||||
};
|
||||
|
||||
static struct mir_ubo_analysis
|
||||
mir_analyze_ranges(compiler_context *ctx)
|
||||
{
|
||||
struct mir_ubo_analysis res = {
|
||||
.nr_blocks = ctx->nir->info.num_ubos + 1,
|
||||
};
|
||||
|
||||
res.blocks = calloc(res.nr_blocks, sizeof(struct mir_ubo_block));
|
||||
|
||||
mir_foreach_instr_global(ctx, ins) {
|
||||
if (!mir_is_direct_aligned_ubo(ins)) continue;
|
||||
|
||||
unsigned ubo = ins->load_store.arg_1;
|
||||
unsigned offset = ins->constants.u32[0] / 16;
|
||||
|
||||
assert(ubo < res.nr_blocks);
|
||||
|
||||
if (offset < MAX_UBO_QWORDS)
|
||||
BITSET_SET(res.blocks[ubo].uses, offset);
|
||||
}
|
||||
|
||||
return res;
|
||||
}
|
||||
|
||||
/* Select UBO words to push. A sophisticated implementation would consider the
|
||||
* number of uses and perhaps the control flow to estimate benefit. This is not
|
||||
* sophisticated. Select from the last UBO first to prioritize sysvals. */
|
||||
|
||||
static void
|
||||
mir_pick_ubo(struct panfrost_ubo_push *push, struct mir_ubo_analysis *analysis, unsigned max_qwords)
|
||||
{
|
||||
unsigned max_words = MIN2(PAN_MAX_PUSH, max_qwords * 4);
|
||||
|
||||
for (signed ubo = analysis->nr_blocks - 1; ubo >= 0; --ubo) {
|
||||
struct mir_ubo_block *block = &analysis->blocks[ubo];
|
||||
|
||||
unsigned vec4;
|
||||
BITSET_FOREACH_SET(vec4, block->uses, MAX_UBO_QWORDS) {
|
||||
/* Don't push more than possible */
|
||||
if (push->count > max_words - 4)
|
||||
return;
|
||||
|
||||
for (unsigned offs = 0; offs < 4; ++offs) {
|
||||
struct panfrost_ubo_word word = {
|
||||
.ubo = ubo,
|
||||
.offset = (vec4 * 16) + (offs * 4)
|
||||
};
|
||||
|
||||
push->words[push->count++] = word;
|
||||
}
|
||||
|
||||
/* Mark it as pushed so we can rewrite */
|
||||
BITSET_SET(block->pushed, vec4);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
#if 0
|
||||
static void
|
||||
mir_dump_ubo_analysis(struct mir_ubo_analysis *res)
|
||||
{
|
||||
printf("%u blocks\n", res->nr_blocks);
|
||||
|
||||
for (unsigned i = 0; i < res->nr_blocks; ++i) {
|
||||
BITSET_WORD *uses = res->blocks[i].uses;
|
||||
BITSET_WORD *push = res->blocks[i].pushed;
|
||||
|
||||
unsigned last = BITSET_LAST_BIT(uses, BITSET_WORDS(MAX_UBO_QWORDS));
|
||||
|
||||
printf("\t");
|
||||
|
||||
for (unsigned j = 0; j < last; ++j) {
|
||||
bool used = BITSET_TEST(uses, j);
|
||||
bool pushed = BITSET_TEST(push, j);
|
||||
assert(used || !pushed);
|
||||
|
||||
putchar(pushed ? '*' : used ? '-' : '_');
|
||||
}
|
||||
|
||||
printf("\n");
|
||||
}
|
||||
}
|
||||
#endif
|
||||
|
||||
static unsigned
|
||||
mir_promoteable_uniform_count(compiler_context *ctx)
|
||||
mir_promoteable_uniform_count(struct mir_ubo_analysis *analysis)
|
||||
{
|
||||
unsigned count = 0;
|
||||
|
||||
mir_foreach_instr_global(ctx, ins) {
|
||||
if (mir_is_promoteable_ubo(ins))
|
||||
count = MAX2(count, ins->constants.u32[0] / 16);
|
||||
for (unsigned i = 0; i < analysis->nr_blocks; ++i) {
|
||||
BITSET_WORD *uses = analysis->blocks[i].uses;
|
||||
|
||||
for (unsigned w = 0; w < BITSET_WORDS(MAX_UBO_QWORDS); ++w)
|
||||
count += util_bitcount(uses[w]);
|
||||
}
|
||||
|
||||
return count;
|
||||
|
|
@ -98,9 +196,9 @@ mir_estimate_pressure(compiler_context *ctx)
|
|||
}
|
||||
|
||||
static unsigned
|
||||
mir_work_heuristic(compiler_context *ctx)
|
||||
mir_work_heuristic(compiler_context *ctx, struct mir_ubo_analysis *analysis)
|
||||
{
|
||||
unsigned uniform_count = mir_promoteable_uniform_count(ctx);
|
||||
unsigned uniform_count = mir_promoteable_uniform_count(analysis);
|
||||
|
||||
/* If there are 8 or fewer uniforms, it doesn't matter what we do, so
|
||||
* allow as many work registers as needed */
|
||||
|
|
@ -160,24 +258,35 @@ mir_special_indices(compiler_context *ctx)
|
|||
void
|
||||
midgard_promote_uniforms(compiler_context *ctx)
|
||||
{
|
||||
unsigned work_count = mir_work_heuristic(ctx);
|
||||
struct mir_ubo_analysis analysis = mir_analyze_ranges(ctx);
|
||||
|
||||
unsigned work_count = mir_work_heuristic(ctx, &analysis);
|
||||
unsigned promoted_count = 24 - work_count;
|
||||
|
||||
mir_pick_ubo(ctx->push, &analysis, promoted_count);
|
||||
|
||||
/* First, figure out special indices a priori so we don't recompute a lot */
|
||||
BITSET_WORD *special = mir_special_indices(ctx);
|
||||
|
||||
mir_foreach_instr_global_safe(ctx, ins) {
|
||||
if (!mir_is_promoteable_ubo(ins)) continue;
|
||||
if (!mir_is_direct_aligned_ubo(ins)) continue;
|
||||
|
||||
unsigned off = ins->constants.u32[0];
|
||||
unsigned address = off / 16;
|
||||
unsigned ubo = ins->load_store.arg_1;
|
||||
unsigned qword = ins->constants.u32[0] / 16;
|
||||
|
||||
/* Check if it's a promotable range */
|
||||
/* Check if we decided to push this */
|
||||
assert(ubo < analysis.nr_blocks);
|
||||
if (!BITSET_TEST(analysis.blocks[ubo].pushed, qword)) continue;
|
||||
|
||||
/* Find where we pushed to, TODO: unaligned pushes to pack */
|
||||
unsigned base = pan_lookup_pushed_ubo(ctx->push, ubo, qword * 16);
|
||||
assert((base & 0x3) == 0);
|
||||
|
||||
unsigned address = base / 4;
|
||||
unsigned uniform_reg = 23 - address;
|
||||
|
||||
if (address >= promoted_count) continue;
|
||||
|
||||
/* It is, great! Let's promote */
|
||||
/* Should've taken into account when pushing */
|
||||
assert(address < promoted_count);
|
||||
|
||||
ctx->uniform_cutoff = MAX2(ctx->uniform_cutoff, address + 1);
|
||||
unsigned promoted = SSA_FIXED_REGISTER(uniform_reg);
|
||||
|
|
@ -207,4 +316,5 @@ midgard_promote_uniforms(compiler_context *ctx)
|
|||
}
|
||||
|
||||
free(special);
|
||||
free(analysis.blocks);
|
||||
}
|
||||
|
|
|
|||
|
|
@ -129,3 +129,22 @@ pan_print_alu_type(nir_alu_type t, FILE *fp)
|
|||
|
||||
fprintf(fp, "%u", size);
|
||||
}
|
||||
|
||||
/* Could optimize with a better data structure if anyone cares, TODO: profile */
|
||||
|
||||
unsigned
|
||||
pan_lookup_pushed_ubo(struct panfrost_ubo_push *push, unsigned ubo, unsigned offs)
|
||||
{
|
||||
struct panfrost_ubo_word word = {
|
||||
.ubo = ubo,
|
||||
.offset = offs
|
||||
};
|
||||
|
||||
for (unsigned i = 0; i < push->count; ++i) {
|
||||
if (memcmp(push->words + i, &word, sizeof(word)) == 0)
|
||||
return i;
|
||||
}
|
||||
|
||||
unreachable("UBO not pushed");
|
||||
|
||||
}
|
||||
|
|
|
|||
|
|
@ -97,6 +97,12 @@ struct panfrost_ubo_push {
|
|||
struct panfrost_ubo_word words[PAN_MAX_PUSH];
|
||||
};
|
||||
|
||||
/* Helper for searching the above. Note this is O(N) in the number of pushed
|
||||
* constants, so do not call it in the draw call hot path. */
|
||||
|
||||
unsigned
|
||||
pan_lookup_pushed_ubo(struct panfrost_ubo_push *push, unsigned ubo, unsigned offs);
|
||||
|
||||
void
|
||||
panfrost_nir_assign_sysvals(struct panfrost_sysvals *ctx, void *memctx, nir_shader *shader);
|
||||
|
||||
|
|
|
|||
Loading…
Add table
Reference in a new issue