broadcom/compiler: add a strategy to disable scheduling of general TMU reads

Scheduling general TMU reads can add quite a bit of register pressure, so
it makes sense to have a strategy that disables it to prevent us from
dropping to 2 threads or increasing spills:

total instructions in shared programs: 12672813 -> 12642413 (-0.24%)
instructions in affected programs: 256721 -> 226321 (-11.84%)
helped: 719
HURT: 77

total threads in shared programs: 415534 -> 416322 (0.19%)
threads in affected programs: 788 -> 1576 (100.00%)
helped: 394
HURT: 0

total uniforms in shared programs: 3711370 -> 3703861 (-0.20%)
uniforms in affected programs: 28859 -> 21350 (-26.02%)
helped: 204
HURT: 455

total max-temps in shared programs: 2159439 -> 2150686 (-0.41%)
max-temps in affected programs: 32945 -> 24192 (-26.57%)
helped: 585
HURT: 47

total spills in shared programs: 5966 -> 3255 (-45.44%)
spills in affected programs: 2933 -> 222 (-92.43%)
helped: 192
HURT: 4

total fills in shared programs: 9328 -> 4630 (-50.36%)
fills in affected programs: 5184 -> 486 (-90.62%)
helped: 196
HURT: 0

Compared to the stats before adding scheduling of non-filtered
memory reads, we see that we have now gotten back all that was
lost and then some:

total instructions in shared programs: 12663186 -> 12642413 (-0.16%)
instructions in affected programs: 2051803 -> 2031030 (-1.01%)
helped: 4885
HURT: 3338

total threads in shared programs: 415870 -> 416322 (0.11%)
threads in affected programs: 896 -> 1348 (50.45%)
helped: 300
HURT: 74

total uniforms in shared programs: 3711629 -> 3703861 (-0.21%)
uniforms in affected programs: 158766 -> 150998 (-4.89%)
helped: 1973
HURT: 499

total max-temps in shared programs: 2138857 -> 2150686 (0.55%)
max-temps in affected programs: 177920 -> 189749 (6.65%)
helped: 2666
HURT: 2035

total spills in shared programs: 3860 -> 3255 (-15.67%)
spills in affected programs: 2653 -> 2048 (-22.80%)
helped: 77
HURT: 21

total fills in shared programs: 5573 -> 4630 (-16.92%)
fills in affected programs: 3839 -> 2896 (-24.56%)
helped: 81
HURT: 15

total sfu-stalls in shared programs: 39583 -> 38154 (-3.61%)
sfu-stalls in affected programs: 8993 -> 7564 (-15.89%)
helped: 1808
HURT: 1038

total nops in shared programs: 324894 -> 323685 (-0.37%)
nops in affected programs: 30362 -> 29153 (-3.98%)
helped: 2513
HURT: 2077

Reviewed-by: Alejandro Piñeiro <apinheiro@igalia.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/15276>
commit a35b47a0b1 (parent f783bd0d2a)
Author: Iago Toral Quiroga, 2022-03-07 14:42:39 +01:00 (committed by Marge Bot)
3 changed files with 67 additions and 32 deletions


@@ -3200,8 +3200,10 @@ ntq_emit_intrinsic(struct v3d_compile *c, nir_intrinsic_instr *instr)
         case nir_intrinsic_load_ubo:
         case nir_intrinsic_load_ssbo:
-                if (!ntq_emit_load_unifa(c, instr))
+                if (!ntq_emit_load_unifa(c, instr)) {
                         ntq_emit_tmu_general(c, instr, false);
+                        c->has_general_tmu_load = true;
+                }
                 break;
         case nir_intrinsic_ssbo_atomic_add:
@@ -3228,14 +3230,17 @@ ntq_emit_intrinsic(struct v3d_compile *c, nir_intrinsic_instr *instr)
         case nir_intrinsic_shared_atomic_xor:
         case nir_intrinsic_shared_atomic_exchange:
         case nir_intrinsic_shared_atomic_comp_swap:
-        case nir_intrinsic_load_shared:
         case nir_intrinsic_store_shared:
-        case nir_intrinsic_load_scratch:
         case nir_intrinsic_store_scratch:
                 ntq_emit_tmu_general(c, instr, true);
                 break;
-        case nir_intrinsic_image_load:
+        case nir_intrinsic_load_scratch:
+        case nir_intrinsic_load_shared:
+                ntq_emit_tmu_general(c, instr, true);
+                c->has_general_tmu_load = true;
+                break;
         case nir_intrinsic_image_store:
         case nir_intrinsic_image_atomic_add:
         case nir_intrinsic_image_atomic_imin:
@@ -3250,6 +3255,15 @@ ntq_emit_intrinsic(struct v3d_compile *c, nir_intrinsic_instr *instr)
                 v3d40_vir_emit_image_load_store(c, instr);
                 break;
+        case nir_intrinsic_image_load:
+                v3d40_vir_emit_image_load_store(c, instr);
+                /* Not really a general TMU load, but we only use this flag
+                 * for NIR scheduling and we do schedule these under the same
+                 * policy as general TMU.
+                 */
+                c->has_general_tmu_load = true;
+                break;
         case nir_intrinsic_get_ssbo_size:
                 ntq_store_dest(c, &instr->dest, 0,
                                vir_uniform(c, QUNIFORM_GET_SSBO_SIZE,


@@ -710,6 +710,11 @@ struct v3d_compile {
         bool disable_loop_unrolling;
         bool unrolled_any_loops;
+        /* Disables scheduling of general TMU loads (and unfiltered image load).
+         */
+        bool disable_general_tmu_sched;
+        bool has_general_tmu_load;
+
         /* Minimum number of threads we are willing to use to register allocate
          * a shader with the current compilation strategy. This only prevents
          * us from lowering the thread count to register allocate successfully,


@@ -550,6 +550,7 @@ vir_compile_init(const struct v3d_compiler *compiler,
                  uint32_t max_threads,
                  uint32_t min_threads_for_reg_alloc,
                  uint32_t max_tmu_spills,
+                 bool disable_general_tmu_sched,
                  bool disable_loop_unrolling,
                  bool disable_constant_ubo_load_sorting,
                  bool disable_tmu_pipelining,
@@ -569,6 +570,7 @@ vir_compile_init(const struct v3d_compiler *compiler,
         c->min_threads_for_reg_alloc = min_threads_for_reg_alloc;
         c->max_tmu_spills = max_tmu_spills;
         c->fallback_scheduler = fallback_scheduler;
+        c->disable_general_tmu_sched = disable_general_tmu_sched;
         c->disable_tmu_pipelining = disable_tmu_pipelining;
         c->disable_constant_ubo_load_sorting = disable_constant_ubo_load_sorting;
         c->disable_loop_unrolling = V3D_DEBUG & V3D_DEBUG_NO_LOOP_UNROLL
@@ -1122,6 +1124,8 @@ v3d_intrinsic_dependency_cb(nir_intrinsic_instr *intr,
 static unsigned
 v3d_instr_delay_cb(nir_instr *instr, void *data)
 {
+        struct v3d_compile *c = (struct v3d_compile *) data;
+
         switch (instr->type) {
         case nir_instr_type_ssa_undef:
         case nir_instr_type_load_const:
@@ -1134,18 +1138,22 @@ v3d_instr_delay_cb(nir_instr *instr, void *data)
                 return 1;
         case nir_instr_type_intrinsic: {
-                nir_intrinsic_instr *intr = nir_instr_as_intrinsic(instr);
-                switch (intr->intrinsic) {
-                case nir_intrinsic_load_ssbo:
-                case nir_intrinsic_load_scratch:
-                case nir_intrinsic_load_shared:
-                case nir_intrinsic_image_load:
-                        return 30;
-                case nir_intrinsic_load_ubo:
-                        if (nir_src_is_divergent(intr->src[1]))
-                                return 30;
-                        FALLTHROUGH;
-                default:
-                        return 1;
-                }
+                if (!c->disable_general_tmu_sched) {
+                        nir_intrinsic_instr *intr = nir_instr_as_intrinsic(instr);
+                        switch (intr->intrinsic) {
+                        case nir_intrinsic_load_ssbo:
+                        case nir_intrinsic_load_scratch:
+                        case nir_intrinsic_load_shared:
+                        case nir_intrinsic_image_load:
+                                return 30;
+                        case nir_intrinsic_load_ubo:
+                                if (nir_src_is_divergent(intr->src[1]))
+                                        return 30;
+                                FALLTHROUGH;
+                        default:
+                                return 1;
+                        }
+                } else {
+                        return 1;
+                }
                 break;
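
For context, this callback is the per-instruction delay estimate consulted by
NIR's scheduler: a large value tells it to move a load well ahead of its uses
to hide latency, while returning 1 for everything (as the new strategy does)
keeps general TMU loads next to their uses, trading latency hiding for shorter
live ranges. The sketch below shows how such a callback is typically handed to
the scheduler; it is not part of this commit, and the nir_schedule_options
field names and the threshold value are assumptions to be checked against
nir_schedule.h:

/* Sketch only: wiring the delay callback into NIR's scheduler.  The
 * instr_delay_cb/instr_delay_cb_data field names are assumed here. */
static void
v3d_nir_schedule_example(struct v3d_compile *c)
{
        nir_schedule_options schedule_options = {
                .threshold = 24,                  /* example pressure target */
                .instr_delay_cb = v3d_instr_delay_cb,
                .instr_delay_cb_data = c,         /* arrives as 'data' above */
        };
        NIR_PASS_V(c->s, nir_schedule, &schedule_options);
}

The cast at the top of v3d_instr_delay_cb exists precisely because the compile
context is passed through that opaque data pointer.
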
@@ -1674,20 +1682,23 @@ struct v3d_compiler_strategy {
         const char *name;
         uint32_t max_threads;
         uint32_t min_threads;
+        bool disable_general_tmu_sched;
         bool disable_loop_unrolling;
         bool disable_ubo_load_sorting;
         bool disable_tmu_pipelining;
         uint32_t max_tmu_spills;
 } static const strategies[] = {
-  /*0*/ { "default",                        4, 4, false, false, false,  0 },
-  /*1*/ { "disable loop unrolling",         4, 4, true,  false, false,  0 },
-  /*2*/ { "disable UBO load sorting",       4, 4, true,  true,  false,  0 },
-  /*3*/ { "disable TMU pipelining",         4, 4, true,  true,  true,   0 },
-  /*4*/ { "lower thread count",             2, 1, false, false, false, -1 },
-  /*5*/ { "disable loop unrolling (ltc)",   2, 1, true,  false, false, -1 },
-  /*6*/ { "disable UBO load sorting (ltc)", 2, 1, true,  true,  false, -1 },
-  /*7*/ { "disable TMU pipelining (ltc)",   2, 1, true,  true,  true,  -1 },
-  /*8*/ { "fallback scheduler",             2, 1, true,  true,  true,  -1 }
+  /*0*/  { "default",                        4, 4, false, false, false, false,  0 },
+  /*1*/  { "disable general TMU sched",      4, 4, true,  false, false, false,  0 },
+  /*2*/  { "disable loop unrolling",         4, 4, true,  true,  false, false,  0 },
+  /*3*/  { "disable UBO load sorting",       4, 4, true,  true,  true,  false,  0 },
+  /*4*/  { "disable TMU pipelining",         4, 4, true,  true,  true,  true,   0 },
+  /*5*/  { "lower thread count",             2, 1, false, false, false, false, -1 },
+  /*6*/  { "disable general TMU sched (2t)", 2, 1, true,  false, false, false, -1 },
+  /*7*/  { "disable loop unrolling (2t)",    2, 1, true,  true,  false, false, -1 },
+  /*8*/  { "disable UBO load sorting (2t)",  2, 1, true,  true,  true,  false, -1 },
+  /*9*/  { "disable TMU pipelining (2t)",    2, 1, true,  true,  true,  true,  -1 },
+  /*10*/ { "fallback scheduler",             2, 1, true,  true,  true,  true,  -1 }
 };
 
 /**
@@ -1695,7 +1706,7 @@ struct v3d_compiler_strategy {
  * attempt disabling it alone won't allow us to compile the shader successfuly,
  * since we'll end up with the same code. Detect these scenarios so we can
  * avoid wasting time with useless compiles. We should also consider if the
  * strategy changes other aspects of the compilation process though, like
  * spilling, and not skip it in that case.
  */
 static bool
@@ -1714,20 +1725,24 @@ skip_compile_strategy(struct v3d_compile *c, uint32_t idx)
         }
         switch (idx) {
-        /* Loop unrolling: skip if we didn't unroll any loops */
+        /* General TMU sched.: skip if we didn't emit any TMU loads */
         case 1:
-        case 5:
+        case 6:
+                return !c->has_general_tmu_load;
+        /* Loop unrolling: skip if we didn't unroll any loops */
+        case 2:
+        case 7:
                 return !c->unrolled_any_loops;
         /* UBO load sorting: skip if we didn't sort any loads */
-        case 2:
-        case 6:
+        case 3:
+        case 8:
                 return !c->sorted_any_ubo_loads;
         /* TMU pipelining: skip if we didn't pipeline any TMU ops */
-        case 3:
-        case 7:
+        case 4:
+        case 9:
                 return !c->pipelined_any_tmu;
         /* Lower thread count: skip if we already tried less that 4 threads */
-        case 4:
+        case 5:
                 return c->threads < 4;
         default:
                 return false;
@@ -1780,6 +1795,7 @@ uint64_t *v3d_compile(const struct v3d_compiler *compiler,
                                 strategies[strat].max_threads,
                                 strategies[strat].min_threads,
                                 strategies[strat].max_tmu_spills,
+                                strategies[strat].disable_general_tmu_sched,
                                 strategies[strat].disable_loop_unrolling,
                                 strategies[strat].disable_ubo_load_sorting,
                                 strategies[strat].disable_tmu_pipelining,
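
Taken together, the new strategy slots into the existing fallback mechanism:
v3d_compile() walks the strategies[] table in order, and skip_compile_strategy()
prunes attempts that cannot change the previous result. A simplified,
self-contained sketch of that loop follows (illustrative only; the types and
helper names are stand-ins, not the actual Mesa code):

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

/* Stand-in for the per-strategy knobs added in this commit. */
struct strategy {
        const char *name;
        bool disable_general_tmu_sched;
        uint32_t max_tmu_spills;
};

/* Stand-in for the feedback a failed attempt leaves behind. */
struct attempt_info {
        bool has_general_tmu_load;
};

/* Hypothetical compile attempt: fills *info and returns success.  For the
 * sketch, pretend only the reduced-pressure strategy fits in registers. */
static bool
try_compile(const struct strategy *s, struct attempt_info *info)
{
        info->has_general_tmu_load = true;
        return s->disable_general_tmu_sched;
}

/* Mirrors the idea of skip_compile_strategy(): disabling general TMU load
 * scheduling cannot help if the previous attempt emitted no such loads. */
static bool
skip_strategy(const struct strategy *s, const struct attempt_info *prev)
{
        return s->disable_general_tmu_sched && !prev->has_general_tmu_load;
}

static bool
compile_with_fallbacks(const struct strategy *strategies, int count)
{
        struct attempt_info prev = { .has_general_tmu_load = true };

        for (int i = 0; i < count; i++) {
                if (i > 0 && skip_strategy(&strategies[i], &prev))
                        continue;               /* same code would result; skip */

                struct attempt_info info;
                if (try_compile(&strategies[i], &info)) {
                        printf("compiled with strategy '%s'\n", strategies[i].name);
                        return true;
                }
                prev = info;                    /* remember what the failure told us */
        }
        return false;
}

int main(void)
{
        const struct strategy table[] = {
                { "default",                   false, 0 },
                { "disable general TMU sched", true,  0 },
        };
        return compile_with_fallbacks(table, 2) ? 0 : 1;
}

The real implementation tracks more per-attempt state (unrolled loops, sorted
UBO loads, pipelined TMU ops, spill counts), but the control flow is the same:
only strategies that can actually change the outcome are retried.
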