diff --git a/src/broadcom/compiler/vir.c b/src/broadcom/compiler/vir.c index 3de40e050db..d95ac010a21 100644 --- a/src/broadcom/compiler/vir.c +++ b/src/broadcom/compiler/vir.c @@ -1146,6 +1146,13 @@ v3d_instr_delay_cb(nir_instr *instr, void *data) case nir_instr_type_phi: return 1; + /* We should not use very large delays for TMU instructions. Typically, + * thread switches will be sufficient to hide all or most of the latency, + * so we typically only need a little bit of extra room. If we over-estimate + * the latency here we may end up unnecessarily delaying the critical path in + * the shader, which would have a negative effect on performance, so here + * we are trying to strike a balance based on empirical testing. + */ case nir_instr_type_intrinsic: { if (!c->disable_general_tmu_sched) { nir_intrinsic_instr *intr = nir_instr_as_intrinsic(instr); @@ -1154,10 +1161,10 @@ v3d_instr_delay_cb(nir_instr *instr, void *data) case nir_intrinsic_load_scratch: case nir_intrinsic_load_shared: case nir_intrinsic_image_load: - return 30; + return 3; case nir_intrinsic_load_ubo: if (nir_src_is_divergent(intr->src[1])) - return 30; + return 3; FALLTHROUGH; default: return 1; @@ -1169,7 +1176,7 @@ v3d_instr_delay_cb(nir_instr *instr, void *data) } case nir_instr_type_tex: - return 50; + return 5; } return 0;