From 871a7536e84a9cefc6db943c19c0056c4f1320eb Mon Sep 17 00:00:00 2001 From: Iago Toral Quiroga Date: Fri, 8 Jul 2022 13:37:02 +0200 Subject: [PATCH] broadcom/compiler: don't over-estimate latency of TMU instructions MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Over-estimating latency can cause us to delay the critical paths of the shader unnecessarily, producing larger QPU programs that take more time to execute as a result (and it also adds register pressure) so striking a balance is important. The thread switching model in V3D is quite effective at hiding latency and usually we just need to hint it to delay TMU instructions a little bit to find the best compromise for performance. The new latency numbers have been chosen empirically by testing V3DV with Sponza and a few UE4 samples. Reviewed-by: Alejandro Piñeiro Part-of: --- src/broadcom/compiler/vir.c | 13 ++++++++++--- 1 file changed, 10 insertions(+), 3 deletions(-) diff --git a/src/broadcom/compiler/vir.c b/src/broadcom/compiler/vir.c index 3de40e050db..d95ac010a21 100644 --- a/src/broadcom/compiler/vir.c +++ b/src/broadcom/compiler/vir.c @@ -1146,6 +1146,13 @@ v3d_instr_delay_cb(nir_instr *instr, void *data) case nir_instr_type_phi: return 1; + /* We should not use very large delays for TMU instructions. Typically, + * thread switches will be sufficient to hide all or most of the latency, + * so we typically only need a little bit of extra room. If we over-estimate + * the latency here we may end up unnecessarily delaying the critical path in + * the shader, which would have a negative effect in performance, so here + * we are trying to strike a balance based on empirical testing. 
+ */ case nir_instr_type_intrinsic: { if (!c->disable_general_tmu_sched) { nir_intrinsic_instr *intr = nir_instr_as_intrinsic(instr); @@ -1154,10 +1161,10 @@ v3d_instr_delay_cb(nir_instr *instr, void *data) case nir_intrinsic_load_scratch: case nir_intrinsic_load_shared: case nir_intrinsic_image_load: - return 30; + return 3; case nir_intrinsic_load_ubo: if (nir_src_is_divergent(intr->src[1])) - return 30; + return 3; FALLTHROUGH; default: return 1; @@ -1169,7 +1176,7 @@ v3d_instr_delay_cb(nir_instr *instr, void *data) } case nir_instr_type_tex: - return 50; + return 5; } return 0;