From 9bf525b4bd733c4bd88e39496eaae4d891d07184 Mon Sep 17 00:00:00 2001 From: Iago Toral Quiroga Date: Mon, 16 Jan 2023 09:04:10 +0100 Subject: [PATCH] broadcom/compiler: produce better code for f2f16 with RTZ rounding MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Suggested by Georg Lehmann, this generates far less code and should be more correct. Closes: https://gitlab.freedesktop.org/mesa/mesa/-/issues/8090 Reviewed-by: Alejandro Piñeiro Part-of: --- src/broadcom/compiler/nir_to_vir.c | 94 +++++------------------------- 1 file changed, 16 insertions(+), 78 deletions(-) diff --git a/src/broadcom/compiler/nir_to_vir.c b/src/broadcom/compiler/nir_to_vir.c index 6ff9b82d73d..6505fe5cf77 100644 --- a/src/broadcom/compiler/nir_to_vir.c +++ b/src/broadcom/compiler/nir_to_vir.c @@ -1299,88 +1299,26 @@ ntq_emit_cond_to_int(struct v3d_compile *c, enum v3d_qpu_cond cond) static struct qreg f2f16_rtz(struct v3d_compile *c, struct qreg f32) { - /* The GPU doesn't provide a mechanism to modify the f32->f16 rounding - * method and seems to be using RTE by default, so we need to implement - * RTZ rounding in software :-( - * - * The implementation identifies the cases where RTZ applies and - * returns the correct result and for everything else, it just uses - * the default RTE conversion. - */ - static bool _first = true; - if (_first && V3D_DBG(PERF)) { - fprintf(stderr, "Shader uses round-toward-zero f32->f16 " - "conversion which is not supported in hardware.\n"); - _first = false; - } + /* The GPU doesn't provide a mechanism to modify the f32->f16 rounding + * method and seems to be using RTE by default, so we need to implement + * RTZ rounding in software. 
+ */ + struct qreg rf16 = vir_FMOV(c, f32); + vir_set_pack(c->defs[rf16.index], V3D_QPU_PACK_L); - struct qinst *inst; - struct qreg tmp; + struct qreg rf32 = vir_FMOV(c, rf16); + vir_set_unpack(c->defs[rf32.index], 0, V3D_QPU_UNPACK_L); - struct qreg result = vir_get_temp(c); + struct qreg f32_abs = vir_FMOV(c, f32); + vir_set_unpack(c->defs[f32_abs.index], 0, V3D_QPU_UNPACK_ABS); - struct qreg mantissa32 = vir_AND(c, f32, vir_uniform_ui(c, 0x007fffff)); + struct qreg rf32_abs = vir_FMOV(c, rf32); + vir_set_unpack(c->defs[rf32_abs.index], 0, V3D_QPU_UNPACK_ABS); - /* Compute sign bit of result */ - struct qreg sign = vir_AND(c, vir_SHR(c, f32, vir_uniform_ui(c, 16)), - vir_uniform_ui(c, 0x8000)); - - /* Check the cases were RTZ rounding is relevant based on exponent */ - struct qreg exp32 = vir_AND(c, vir_SHR(c, f32, vir_uniform_ui(c, 23)), - vir_uniform_ui(c, 0xff)); - struct qreg exp16 = vir_ADD(c, exp32, vir_uniform_ui(c, -127 + 15)); - - /* if (exp16 > 30) */ - inst = vir_MIN_dest(c, vir_nop_reg(), exp16, vir_uniform_ui(c, 30)); - vir_set_pf(c, inst, V3D_QPU_PF_PUSHC); - inst = vir_OR_dest(c, result, sign, vir_uniform_ui(c, 0x7bff)); - vir_set_cond(inst, V3D_QPU_COND_IFA); - - /* if (exp16 <= 30) */ - inst = vir_OR_dest(c, result, - vir_OR(c, sign, - vir_SHL(c, exp16, vir_uniform_ui(c, 10))), - vir_SHR(c, mantissa32, vir_uniform_ui(c, 13))); - vir_set_cond(inst, V3D_QPU_COND_IFNA); - - /* if (exp16 <= 0) */ - inst = vir_MIN_dest(c, vir_nop_reg(), exp16, vir_uniform_ui(c, 0)); - vir_set_pf(c, inst, V3D_QPU_PF_PUSHC); - - tmp = vir_OR(c, mantissa32, vir_uniform_ui(c, 0x800000)); - tmp = vir_SHR(c, tmp, vir_SUB(c, vir_uniform_ui(c, 14), exp16)); - inst = vir_OR_dest(c, result, sign, tmp); - vir_set_cond(inst, V3D_QPU_COND_IFNA); - - /* Cases where RTZ mode is not relevant: use default RTE conversion. 
- * - * The cases that are not affected by RTZ are: - * - * exp16 < - 10 || exp32 == 0 || exp32 == 0xff - * - * In V3D we can implement this condition as: - * - * !((exp16 >= -10) && !(exp32 == 0) && !(exp32 == 0xff))) - */ - - /* exp16 >= -10 */ - inst = vir_MIN_dest(c, vir_nop_reg(), exp16, vir_uniform_ui(c, -10)); - vir_set_pf(c, inst, V3D_QPU_PF_PUSHC); - - /* && !(exp32 == 0) */ - inst = vir_MOV_dest(c, vir_nop_reg(), exp32); - vir_set_uf(c, inst, V3D_QPU_UF_ANDNZ); - - /* && !(exp32 == 0xff) */ - inst = vir_XOR_dest(c, vir_nop_reg(), exp32, vir_uniform_ui(c, 0xff)); - vir_set_uf(c, inst, V3D_QPU_UF_ANDNZ); - - /* Use regular RTE conversion if condition is False */ - inst = vir_FMOV_dest(c, result, f32); - vir_set_pack(inst, V3D_QPU_PACK_L); - vir_set_cond(inst, V3D_QPU_COND_IFNA); - - return vir_MOV(c, result); + vir_set_pf(c, vir_FCMP_dest(c, vir_nop_reg(), f32_abs, rf32_abs), + V3D_QPU_PF_PUSHN); + return vir_MOV(c, vir_SEL(c, V3D_QPU_COND_IFA, + vir_SUB(c, rf16, vir_uniform_ui(c, 1)), rf16)); } /**