From 82981ccbb1d1bef20b1dfdc93f3ed6d98b8c708a Mon Sep 17 00:00:00 2001 From: Iago Toral Quiroga Date: Thu, 11 Feb 2021 12:28:52 +0100 Subject: [PATCH] broadcom/compiler: use unifa for UBO loads from uniform addresses MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This basically processes UBO loads as uniform loads by writing the load address to the unifa register and reading sequential values with ldunifa. This process is faster than going through the TMU, but we can only use it when the address we are reading from is uniform across all channels, since we are basically reading from the UBO address as if it was a uniform stream. This leads to better performance in the UE4 Shooter demo. Reviewed-by: Alejandro Piñeiro Part-of: --- src/broadcom/compiler/nir_to_vir.c | 41 +++++++++++++++++++++++++++++- 1 file changed, 40 insertions(+), 1 deletion(-) diff --git a/src/broadcom/compiler/nir_to_vir.c b/src/broadcom/compiler/nir_to_vir.c index 7611bd00be5..30b018de23f 100644 --- a/src/broadcom/compiler/nir_to_vir.c +++ b/src/broadcom/compiler/nir_to_vir.c @@ -2564,6 +2564,42 @@ ntq_emit_load_interpolated_input(struct v3d_compile *c, return vir_FADD(c, vir_FMUL(c, pInterp, wInterp), C); } +static void +ntq_emit_load_ubo_unifa(struct v3d_compile *c, nir_intrinsic_instr *instr) +{ + bool dynamic_src = !nir_src_is_const(instr->src[1]); + uint32_t const_offset = + dynamic_src ? 0 : nir_src_as_uint(instr->src[1]); + + /* On OpenGL QUNIFORM_UBO_ADDR takes a UBO index + * shifted up by 1 (0 is gallium's constant buffer 0). 
+ */ + uint32_t index = nir_src_as_uint(instr->src[0]); + if (c->key->environment == V3D_ENVIRONMENT_OPENGL) + index++; + + struct qreg base_offset = + vir_uniform(c, QUNIFORM_UBO_ADDR, + v3d_unit_data_create(index, const_offset)); + const_offset = 0; + + struct qreg unifa = vir_reg(QFILE_MAGIC, V3D_QPU_WADDR_UNIFA); + if (!dynamic_src) { + vir_MOV_dest(c, unifa, base_offset); + } else { + vir_ADD_dest(c, unifa, base_offset, + ntq_get_src(c, instr->src[1], 0)); + } + + for (uint32_t i = 0; i < nir_intrinsic_dest_components(instr); i++) { + struct qinst *ldunifa = + vir_add_inst(V3D_QPU_A_NOP, c->undef, c->undef, c->undef); + ldunifa->qpu.sig.ldunifa = true; + struct qreg data = vir_emit_def(c, ldunifa); + ntq_store_dest(c, &instr->dest, i, vir_MOV(c, data)); + } +} + static void ntq_emit_intrinsic(struct v3d_compile *c, nir_intrinsic_instr *instr) { @@ -2573,7 +2609,10 @@ ntq_emit_intrinsic(struct v3d_compile *c, nir_intrinsic_instr *instr) break; case nir_intrinsic_load_ubo: - ntq_emit_tmu_general(c, instr, false); + if (!nir_src_is_divergent(instr->src[1])) + ntq_emit_load_ubo_unifa(c, instr); + else + ntq_emit_tmu_general(c, instr, false); break; case nir_intrinsic_ssbo_atomic_add: