broadcom/compiler: add a strategy to disable scheduling of general TMU reads
This can add quite a bit of register pressure so it makes sense to be able
to disable it, to prevent us from dropping to 2 threads or increasing
spills:

total instructions in shared programs: 12672813 -> 12642413 (-0.24%)
instructions in affected programs: 256721 -> 226321 (-11.84%)
helped: 719
HURT: 77

total threads in shared programs: 415534 -> 416322 (0.19%)
threads in affected programs: 788 -> 1576 (100.00%)
helped: 394
HURT: 0

total uniforms in shared programs: 3711370 -> 3703861 (-0.20%)
uniforms in affected programs: 28859 -> 21350 (-26.02%)
helped: 204
HURT: 455

total max-temps in shared programs: 2159439 -> 2150686 (-0.41%)
max-temps in affected programs: 32945 -> 24192 (-26.57%)
helped: 585
HURT: 47

total spills in shared programs: 5966 -> 3255 (-45.44%)
spills in affected programs: 2933 -> 222 (-92.43%)
helped: 192
HURT: 4

total fills in shared programs: 9328 -> 4630 (-50.36%)
fills in affected programs: 5184 -> 486 (-90.62%)
helped: 196
HURT: 0

Compared to the stats before adding scheduling of non-filtered memory
reads, we see that we have now gotten back all that was lost, and then
some:

total instructions in shared programs: 12663186 -> 12642413 (-0.16%)
instructions in affected programs: 2051803 -> 2031030 (-1.01%)
helped: 4885
HURT: 3338

total threads in shared programs: 415870 -> 416322 (0.11%)
threads in affected programs: 896 -> 1348 (50.45%)
helped: 300
HURT: 74

total uniforms in shared programs: 3711629 -> 3703861 (-0.21%)
uniforms in affected programs: 158766 -> 150998 (-4.89%)
helped: 1973
HURT: 499

total max-temps in shared programs: 2138857 -> 2150686 (0.55%)
max-temps in affected programs: 177920 -> 189749 (6.65%)
helped: 2666
HURT: 2035

total spills in shared programs: 3860 -> 3255 (-15.67%)
spills in affected programs: 2653 -> 2048 (-22.80%)
helped: 77
HURT: 21

total fills in shared programs: 5573 -> 4630 (-16.92%)
fills in affected programs: 3839 -> 2896 (-24.56%)
helped: 81
HURT: 15

total sfu-stalls in shared programs: 39583 -> 38154 (-3.61%)
sfu-stalls in affected programs: 8993 -> 7564 (-15.89%)
helped: 1808
HURT: 1038

total nops in shared programs: 324894 -> 323685 (-0.37%)
nops in affected programs: 30362 -> 29153 (-3.98%)
helped: 2513
HURT: 2077

Reviewed-by: Alejandro Piñeiro <apinheiro@igalia.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/15276>
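The strategy-skipping idea the diff below extends can be summarized with a
small, self-contained sketch. All names here are illustrative, not the
driver's actual code (the real logic lives in skip_compile_strategy() and
the strategies[] table further down): a strategy that disables a feature is
only worth retrying if the previous compile actually exercised that feature.

/* Hypothetical, simplified model of the strategy-skip heuristic:
 * recompiling with "disable general TMU sched" can only produce
 * different code if the previous attempt emitted at least one
 * general TMU load. */
#include <stdbool.h>
#include <stdio.h>

struct strategy {
        const char *name;
        bool disable_general_tmu_sched;
};

struct compile_state {
        bool has_general_tmu_load; /* set while emitting TMU loads */
};

/* Skip a strategy when it would compile to exactly the same code. */
static bool
skip_strategy(const struct compile_state *c, const struct strategy *s)
{
        return s->disable_general_tmu_sched && !c->has_general_tmu_load;
}

int
main(void)
{
        static const struct strategy strategies[] = {
                { "default",                   false },
                { "disable general TMU sched", true  },
        };
        /* A shader with no general TMU loads: strategy 1 is pointless. */
        struct compile_state c = { .has_general_tmu_load = false };

        for (unsigned i = 0; i < 2; i++) {
                if (i > 0 && skip_strategy(&c, &strategies[i])) {
                        printf("skipping '%s'\n", strategies[i].name);
                        continue;
                }
                printf("compiling with '%s'\n", strategies[i].name);
        }
        return 0;
}

Compiled standalone, this prints that the second strategy is skipped for a
shader with no general TMU loads, which is exactly the compile time the new
has_general_tmu_load flag saves.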
This commit is contained in:
parent f783bd0d2a
commit a35b47a0b1

3 changed files with 67 additions and 32 deletions
src/broadcom/compiler/nir_to_vir.c

@@ -3200,8 +3200,10 @@ ntq_emit_intrinsic(struct v3d_compile *c, nir_intrinsic_instr *instr)
         case nir_intrinsic_load_ubo:
         case nir_intrinsic_load_ssbo:
-                if (!ntq_emit_load_unifa(c, instr))
+                if (!ntq_emit_load_unifa(c, instr)) {
                         ntq_emit_tmu_general(c, instr, false);
+                        c->has_general_tmu_load = true;
+                }
                 break;
 
         case nir_intrinsic_ssbo_atomic_add:
@@ -3228,14 +3230,17 @@ ntq_emit_intrinsic(struct v3d_compile *c, nir_intrinsic_instr *instr)
         case nir_intrinsic_shared_atomic_xor:
         case nir_intrinsic_shared_atomic_exchange:
         case nir_intrinsic_shared_atomic_comp_swap:
-        case nir_intrinsic_load_shared:
         case nir_intrinsic_store_shared:
-        case nir_intrinsic_load_scratch:
         case nir_intrinsic_store_scratch:
                 ntq_emit_tmu_general(c, instr, true);
                 break;
 
-        case nir_intrinsic_image_load:
+        case nir_intrinsic_load_scratch:
+        case nir_intrinsic_load_shared:
+                ntq_emit_tmu_general(c, instr, true);
+                c->has_general_tmu_load = true;
+                break;
+
         case nir_intrinsic_image_store:
         case nir_intrinsic_image_atomic_add:
         case nir_intrinsic_image_atomic_imin:
@@ -3250,6 +3255,15 @@ ntq_emit_intrinsic(struct v3d_compile *c, nir_intrinsic_instr *instr)
                 v3d40_vir_emit_image_load_store(c, instr);
                 break;
 
+        case nir_intrinsic_image_load:
+                v3d40_vir_emit_image_load_store(c, instr);
+                /* Not really a general TMU load, but we only use this flag
+                 * for NIR scheduling and we do schedule these under the same
+                 * policy as general TMU.
+                 */
+                c->has_general_tmu_load = true;
+                break;
+
         case nir_intrinsic_get_ssbo_size:
                 ntq_store_dest(c, &instr->dest, 0,
                                vir_uniform(c, QUNIFORM_GET_SSBO_SIZE,
src/broadcom/compiler/v3d_compiler.h

@@ -710,6 +710,11 @@ struct v3d_compile {
         bool disable_loop_unrolling;
         bool unrolled_any_loops;
 
+        /* Disables scheduling of general TMU loads (and unfiltered image load).
+         */
+        bool disable_general_tmu_sched;
+        bool has_general_tmu_load;
+
         /* Minimum number of threads we are willing to use to register allocate
          * a shader with the current compilation strategy. This only prevents
          * us from lowering the thread count to register allocate successfully,
src/broadcom/compiler/vir.c

@@ -550,6 +550,7 @@ vir_compile_init(const struct v3d_compiler *compiler,
                  uint32_t max_threads,
                  uint32_t min_threads_for_reg_alloc,
                  uint32_t max_tmu_spills,
+                 bool disable_general_tmu_sched,
                  bool disable_loop_unrolling,
                  bool disable_constant_ubo_load_sorting,
                  bool disable_tmu_pipelining,
@@ -569,6 +570,7 @@ vir_compile_init(const struct v3d_compiler *compiler,
         c->min_threads_for_reg_alloc = min_threads_for_reg_alloc;
         c->max_tmu_spills = max_tmu_spills;
         c->fallback_scheduler = fallback_scheduler;
+        c->disable_general_tmu_sched = disable_general_tmu_sched;
         c->disable_tmu_pipelining = disable_tmu_pipelining;
         c->disable_constant_ubo_load_sorting = disable_constant_ubo_load_sorting;
         c->disable_loop_unrolling = V3D_DEBUG & V3D_DEBUG_NO_LOOP_UNROLL
@@ -1122,6 +1124,8 @@ v3d_intrinsic_dependency_cb(nir_intrinsic_instr *intr,
 static unsigned
 v3d_instr_delay_cb(nir_instr *instr, void *data)
 {
+        struct v3d_compile *c = (struct v3d_compile *) data;
+
         switch (instr->type) {
         case nir_instr_type_ssa_undef:
         case nir_instr_type_load_const:
@@ -1134,18 +1138,22 @@ v3d_instr_delay_cb(nir_instr *instr, void *data)
                 return 1;
 
         case nir_instr_type_intrinsic: {
-                nir_intrinsic_instr *intr = nir_instr_as_intrinsic(instr);
-                switch (intr->intrinsic) {
-                case nir_intrinsic_load_ssbo:
-                case nir_intrinsic_load_scratch:
-                case nir_intrinsic_load_shared:
-                case nir_intrinsic_image_load:
-                        return 30;
-                case nir_intrinsic_load_ubo:
-                        if (nir_src_is_divergent(intr->src[1]))
-                                return 30;
-                        FALLTHROUGH;
-                default:
+                if (!c->disable_general_tmu_sched) {
+                        nir_intrinsic_instr *intr = nir_instr_as_intrinsic(instr);
+                        switch (intr->intrinsic) {
+                        case nir_intrinsic_load_ssbo:
+                        case nir_intrinsic_load_scratch:
+                        case nir_intrinsic_load_shared:
+                        case nir_intrinsic_image_load:
+                                return 30;
+                        case nir_intrinsic_load_ubo:
+                                if (nir_src_is_divergent(intr->src[1]))
+                                        return 30;
+                                FALLTHROUGH;
+                        default:
+                                return 1;
+                        }
+                } else {
                         return 1;
                 }
                 break;
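The gist of the hunk above: the delay callback now only advertises the
~30-cycle TMU latency for general loads when the active strategy permits
it, so the NIR scheduler stops hoisting those loads far from their uses.
A standalone sketch of that gating follows; the enum, struct, and function
names are hypothetical stand-ins, not the driver's real types:

/* Model of the gated delay callback: a long delay tells the scheduler
 * to hide latency (raising register pressure), a delay of 1 treats the
 * load like any cheap instruction. */
#include <stdbool.h>
#include <stdio.h>

enum instr_kind { INSTR_ALU, INSTR_TMU_GENERAL_LOAD, INSTR_TEX };

struct compile_opts {
        bool disable_general_tmu_sched;
};

static unsigned
instr_delay(enum instr_kind kind, const struct compile_opts *opts)
{
        switch (kind) {
        case INSTR_TMU_GENERAL_LOAD:
                /* Hiding the latency is only worth it when the strategy
                 * accepts the extra register pressure. */
                return opts->disable_general_tmu_sched ? 1 : 30;
        case INSTR_TEX:
                return 30; /* filtered texture reads keep their delay */
        default:
                return 1;
        }
}

int
main(void)
{
        struct compile_opts relaxed = { .disable_general_tmu_sched = false };
        struct compile_opts strict  = { .disable_general_tmu_sched = true };

        printf("general TMU load delay, default strategy: %u\n",
               instr_delay(INSTR_TMU_GENERAL_LOAD, &relaxed));
        printf("general TMU load delay, TMU sched disabled: %u\n",
               instr_delay(INSTR_TMU_GENERAL_LOAD, &strict));
        return 0;
}

Returning 1 instead of 30 keeps the load's result live for a shorter span,
trading exposed latency for lower register pressure, which is what recovers
the spills and thread counts quoted in the commit message.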
@@ -1674,20 +1682,23 @@ struct v3d_compiler_strategy {
         const char *name;
         uint32_t max_threads;
         uint32_t min_threads;
+        bool disable_general_tmu_sched;
         bool disable_loop_unrolling;
         bool disable_ubo_load_sorting;
         bool disable_tmu_pipelining;
         uint32_t max_tmu_spills;
 } static const strategies[] = {
-        /*0*/ { "default",                        4, 4, false, false, false,  0 },
-        /*1*/ { "disable loop unrolling",         4, 4, true,  false, false,  0 },
-        /*2*/ { "disable UBO load sorting",       4, 4, true,  true,  false,  0 },
-        /*3*/ { "disable TMU pipelining",         4, 4, true,  true,  true,   0 },
-        /*4*/ { "lower thread count",             2, 1, false, false, false, -1 },
-        /*5*/ { "disable loop unrolling (ltc)",   2, 1, true,  false, false, -1 },
-        /*6*/ { "disable UBO load sorting (ltc)", 2, 1, true,  true,  false, -1 },
-        /*7*/ { "disable TMU pipelining (ltc)",   2, 1, true,  true,  true,  -1 },
-        /*8*/ { "fallback scheduler",             2, 1, true,  true,  true,  -1 }
+        /*0*/  { "default",                          4, 4, false, false, false, false,  0 },
+        /*1*/  { "disable general TMU sched",        4, 4, true,  false, false, false,  0 },
+        /*2*/  { "disable loop unrolling",           4, 4, true,  true,  false, false,  0 },
+        /*3*/  { "disable UBO load sorting",         4, 4, true,  true,  true,  false,  0 },
+        /*4*/  { "disable TMU pipelining",           4, 4, true,  true,  true,  true,   0 },
+        /*5*/  { "lower thread count",               2, 1, false, false, false, false, -1 },
+        /*6*/  { "disable general TMU sched (2t)",   2, 1, true,  false, false, false, -1 },
+        /*7*/  { "disable loop unrolling (2t)",      2, 1, true,  true,  false, false, -1 },
+        /*8*/  { "disable UBO load sorting (2t)",    2, 1, true,  true,  true,  false, -1 },
+        /*9*/  { "disable TMU pipelining (2t)",      2, 1, true,  true,  true,  true,  -1 },
+        /*10*/ { "fallback scheduler",               2, 1, true,  true,  true,  true,  -1 }
 };
 
 /**
@@ -1695,7 +1706,7 @@ struct v3d_compiler_strategy {
  * attempt disabling it alone won't allow us to compile the shader successfuly,
  * since we'll end up with the same code. Detect these scenarios so we can
  * avoid wasting time with useless compiles. We should also consider if the
  * strategy changes other aspects of the compilation process though, like
  * spilling, and not skip it in that case.
  */
 static bool
@@ -1714,20 +1725,24 @@ skip_compile_strategy(struct v3d_compile *c, uint32_t idx)
         }
 
         switch (idx) {
-        /* Loop unrolling: skip if we didn't unroll any loops */
+        /* General TMU sched.: skip if we didn't emit any TMU loads */
         case 1:
-        case 5:
+        case 6:
+                return !c->has_general_tmu_load;
+        /* Loop unrolling: skip if we didn't unroll any loops */
+        case 2:
+        case 7:
                 return !c->unrolled_any_loops;
         /* UBO load sorting: skip if we didn't sort any loads */
-        case 2:
-        case 6:
+        case 3:
+        case 8:
                 return !c->sorted_any_ubo_loads;
         /* TMU pipelining: skip if we didn't pipeline any TMU ops */
-        case 3:
-        case 7:
+        case 4:
+        case 9:
                 return !c->pipelined_any_tmu;
         /* Lower thread count: skip if we already tried less that 4 threads */
-        case 4:
+        case 5:
                 return c->threads < 4;
         default:
                 return false;
@@ -1780,6 +1795,7 @@ uint64_t *v3d_compile(const struct v3d_compiler *compiler,
                                  strategies[strat].max_threads,
                                  strategies[strat].min_threads,
                                  strategies[strat].max_tmu_spills,
+                                 strategies[strat].disable_general_tmu_sched,
                                  strategies[strat].disable_loop_unrolling,
                                  strategies[strat].disable_ubo_load_sorting,
                                  strategies[strat].disable_tmu_pipelining,