From a0b94209154ea98d1de04647f3e18b4426d8fb7c Mon Sep 17 00:00:00 2001 From: Alyssa Rosenzweig Date: Sun, 7 Feb 2021 16:15:07 -0500 Subject: [PATCH] pan/mdg: Set lower_uniforms_to_ubo MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Removes our custom load_uniform implementation and unifies the command stream side with Bifrost, preparing for additional optimizations. shader-db results are a wash. It's worth noting some of the increase in bundles is due to peephole select which is notoriously awkward for shader-db stats. total instructions in shared programs: 96611 -> 95613 (-1.03%) instructions in affected programs: 17562 -> 16564 (-5.68%) helped: 137 HURT: 13 helped stats (abs) min: 2 max: 27 x̄: 7.83 x̃: 7 helped stats (rel) min: 0.61% max: 20.00% x̄: 7.19% x̃: 5.75% HURT stats (abs) min: 1 max: 33 x̄: 5.77 x̃: 2 HURT stats (rel) min: 0.47% max: 15.64% x̄: 3.56% x̃: 1.72% 95% mean confidence interval for instructions value: -7.78 -5.53 95% mean confidence interval for instructions %-change: -7.13% -5.38% Instructions are helped. total bundles in shared programs: 44886 -> 45230 (0.77%) bundles in affected programs: 6649 -> 6993 (5.17%) helped: 54 HURT: 68 helped stats (abs) min: 1 max: 6 x̄: 2.35 x̃: 2 helped stats (rel) min: 1.04% max: 6.82% x̄: 4.37% x̃: 4.80% HURT stats (abs) min: 1 max: 16 x̄: 6.93 x̃: 6 HURT stats (rel) min: 2.22% max: 37.50% x̄: 14.03% x̃: 10.00% 95% mean confidence interval for bundles value: 1.78 3.85 95% mean confidence interval for bundles %-change: 3.73% 8.04% Bundles are HURT. total quadwords in shared programs: 76320 -> 75533 (-1.03%) quadwords in affected programs: 12404 -> 11617 (-6.34%) helped: 133 HURT: 3 helped stats (abs) min: 1 max: 18 x̄: 6.18 x̃: 6 helped stats (rel) min: 0.36% max: 18.18% x̄: 7.34% x̃: 7.45% HURT stats (abs) min: 1 max: 19 x̄: 11.67 x̃: 15 HURT stats (rel) min: 0.55% max: 15.79% x̄: 9.94% x̃: 13.48% 95% mean confidence interval for quadwords value: -6.41 -5.16 95% mean confidence interval for quadwords %-change: -7.58% -6.34% Quadwords are helped. total registers in shared programs: 6958 -> 6928 (-0.43%) registers in affected programs: 524 -> 494 (-5.73%) helped: 42 HURT: 15 helped stats (abs) min: 1 max: 2 x̄: 1.10 x̃: 1 helped stats (rel) min: 6.25% max: 25.00% x̄: 12.71% x̃: 12.50% HURT stats (abs) min: 1 max: 2 x̄: 1.07 x̃: 1 HURT stats (rel) min: 9.09% max: 20.00% x̄: 11.44% x̃: 11.11% 95% mean confidence interval for registers value: -0.79 -0.26 95% mean confidence interval for registers %-change: -9.35% -3.36% Registers are helped. total threads in shared programs: 5109 -> 5107 (-0.04%) threads in affected programs: 16 -> 14 (-12.50%) helped: 2 HURT: 6 helped stats (abs) min: 2 max: 2 x̄: 2.00 x̃: 2 helped stats (rel) min: 100.00% max: 100.00% x̄: 100.00% x̃: 100.00% HURT stats (abs) min: 1 max: 1 x̄: 1.00 x̃: 1 HURT stats (rel) min: 50.00% max: 50.00% x̄: 50.00% x̃: 50.00% 95% mean confidence interval for threads value: -1.41 0.91 95% mean confidence interval for threads %-change: -70.55% 45.55% Inconclusive result (value mean confidence interval includes 0). Signed-off-by: Alyssa Rosenzweig WIP - do peephole ourselves Reviewed-by: Boris Brezillon Part-of: --- src/gallium/drivers/panfrost/pan_assemble.c | 14 +++++-------- src/panfrost/midgard/midgard_compile.c | 23 ++++++++++++++------- 2 files changed, 21 insertions(+), 16 deletions(-) diff --git a/src/gallium/drivers/panfrost/pan_assemble.c b/src/gallium/drivers/panfrost/pan_assemble.c index b1f74ab8a36..9508785bb1b 100644 --- a/src/gallium/drivers/panfrost/pan_assemble.c +++ b/src/gallium/drivers/panfrost/pan_assemble.c @@ -399,16 +399,12 @@ panfrost_shader_compile(struct panfrost_context *ctx, state->attribute_count = attribute_count; state->varying_count = varying_count; - /* off-by-one for uniforms. Not needed on Bifrost since uniforms - * have been lowered to UBOs using nir_lower_uniforms_to_ubo() which - * already increments s->info.num_ubos. We do have to account for the - * "no uniform, no UBO" case though, otherwise sysval passed through - * uniforms won't work correctly. + /* Uniforms have been lowered to UBOs using nir_lower_uniforms_to_ubo() + * which already increments s->info.num_ubos. We do have to account for + * the "no uniform, no UBO" case though, otherwise sysval passed + * through uniforms won't work correctly. */ - if (pan_is_bifrost(dev)) - state->ubo_count = MAX2(s->info.num_ubos, 1); - else - state->ubo_count = s->info.num_ubos + 1; + state->ubo_count = MAX2(s->info.num_ubos, 1); /* Prepare the descriptors at compile-time */ state->shader.shader = shader; diff --git a/src/panfrost/midgard/midgard_compile.c b/src/panfrost/midgard/midgard_compile.c index f23baf99200..39d28a9e649 100644 --- a/src/panfrost/midgard/midgard_compile.c +++ b/src/panfrost/midgard/midgard_compile.c @@ -320,6 +320,15 @@ optimise_nir(nir_shader *nir, unsigned quirks, bool is_blend) NIR_PASS(progress, nir, midgard_nir_lower_algebraic_early); + /* Peephole select is more effective before lowering uniforms to UBO, + * so do a round of that, and then call lower_uniforms_to_ubo + * explicitly (instead of relying on the state tracker to do it). Note + * the state tracker does run peephole_select before lowering uniforms + * to UBO ordinarily, but it isn't as aggressive as we need. */ + + NIR_PASS(progress, nir, nir_opt_peephole_select, 64, false, true); + NIR_PASS_V(nir, nir_lower_uniforms_to_ubo, 16); + do { progress = false; @@ -1653,7 +1662,6 @@ emit_intrinsic(compiler_context *ctx, nir_intrinsic_instr *instr) break; } - case nir_intrinsic_load_uniform: case nir_intrinsic_load_ubo: case nir_intrinsic_load_global: case nir_intrinsic_load_global_constant: @@ -1662,7 +1670,6 @@ emit_intrinsic(compiler_context *ctx, nir_intrinsic_instr *instr) case nir_intrinsic_load_input: case nir_intrinsic_load_kernel_input: case nir_intrinsic_load_interpolated_input: { - bool is_uniform = instr->intrinsic == nir_intrinsic_load_uniform; bool is_ubo = instr->intrinsic == nir_intrinsic_load_ubo; bool is_global = instr->intrinsic == nir_intrinsic_load_global || instr->intrinsic == nir_intrinsic_load_global_constant; @@ -1676,7 +1683,7 @@ emit_intrinsic(compiler_context *ctx, nir_intrinsic_instr *instr) /* TODO: Infer type? Does it matter? */ nir_alu_type t = (is_interp) ? nir_type_float : - (is_uniform || is_flat) ? nir_intrinsic_dest_type(instr) : + (is_flat) ? nir_intrinsic_dest_type(instr) : nir_type_uint; t = nir_alu_type_get_base_type(t); @@ -1700,9 +1707,7 @@ emit_intrinsic(compiler_context *ctx, nir_intrinsic_instr *instr) nir_intrinsic_component(instr) : 0; reg = nir_dest_index(&instr->dest); - if (is_uniform && !ctx->is_blend) { - emit_ubo_read(ctx, &instr->instr, reg, (ctx->sysvals.sysval_count + offset) * 16, indirect_offset, 4, 0); - } else if (is_kernel) { + if (is_kernel) { emit_ubo_read(ctx, &instr->instr, reg, (ctx->sysvals.sysval_count * 16) + offset, indirect_offset, 0, 0); } else if (is_ubo) { nir_src index = instr->src[0]; @@ -1710,7 +1715,11 @@ emit_intrinsic(compiler_context *ctx, nir_intrinsic_instr *instr) /* TODO: Is indirect block number possible? */ assert(nir_src_is_const(index)); - uint32_t uindex = nir_src_as_uint(index) + 1; + uint32_t uindex = nir_src_as_uint(index); + + if (uindex == 0) + offset += ctx->sysvals.sysval_count * 16; + emit_ubo_read(ctx, &instr->instr, reg, offset, indirect_offset, 0, uindex); } else if (is_global || is_shared || is_scratch) { unsigned seg = is_global ? LDST_GLOBAL : (is_shared ? LDST_SHARED : LDST_SCRATCH);