From 1fb4427a7aeeca7ada1ff57faad69a56da1c53cd Mon Sep 17 00:00:00 2001
From: Alyssa Rosenzweig <alyssa@collabora.com>
Date: Sun, 27 Feb 2022 15:46:17 -0500
Subject: [PATCH] pan/bi: Imply round mode most of the time

Much less noisy, and provides a path to further improvements. There is a slight
behaviour change: int-to-float conversions now use RTE instead of RTZ. For
32-bit opcodes, this affects conversions of integers with magnitude greater than
2^24 by at most 1 ulp. As this behaviour is unspecified in GLSL, this change is
believed to be acceptable.

Signed-off-by: Alyssa Rosenzweig <alyssa@collabora.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/15187>
---
 src/panfrost/bifrost/bi_builder.h.py          |  18 ++-
 src/panfrost/bifrost/bifrost_compile.c        | 140 +++++++++---------
 src/panfrost/bifrost/bifrost_isa.py           |   1 +
 .../bifrost/test/test-constant-fold.cpp       |   4 +-
 src/panfrost/bifrost/test/test-optimizer.cpp  | 134 ++++++++---------
 .../test/test-scheduler-predicates.cpp        |   8 +-
 .../bifrost/valhall/test/test-add-imm.cpp     |  40 ++---
 .../bifrost/valhall/test/test-lower-isel.cpp  |   2 +-
 .../bifrost/valhall/test/test-packing.cpp     |  31 ++--
 .../valhall/test/test-validate-fau.cpp        |  40 +++--
 10 files changed, 209 insertions(+), 209 deletions(-)

diff --git a/src/panfrost/bifrost/bi_builder.h.py b/src/panfrost/bifrost/bi_builder.h.py
index 81b78b7177e..5ba37818264 100644
--- a/src/panfrost/bifrost/bi_builder.h.py
+++ b/src/panfrost/bifrost/bi_builder.h.py
@@ -19,7 +19,9 @@
 # FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
 # IN THE SOFTWARE.
 
-SKIP = set(["lane", "lane_dest", "lanes", "lanes", "replicate", "swz", "widen", "swap", "neg", "abs", "not", "sign", "extend", "divzero", "clamp", "sem", "not_result", "skip"])
+SKIP = set(["lane", "lane_dest", "lanes", "lanes", "replicate", "swz", "widen",
+    "swap", "neg", "abs", "not", "sign", "extend", "divzero", "clamp", "sem",
+    "not_result", "skip", "round"])
 
 TEMPLATE = """
 #ifndef _BI_BUILDER_H_
@@ -99,10 +101,13 @@ bi_instr * bi_${opcode.replace('.', '_').lower()}${to_suffix(ops[opcode])}(${sig
     I->src[${src}] = src${src};
 % endfor
 % for mod in ops[opcode]["modifiers"]:
-% if not should_skip(mod):
+% if not should_skip(mod, opcode):
     I->${mod} = ${mod};
 % endif
 % endfor
+% if ops[opcode]["rtz"]:
+    I->round = BI_ROUND_RTZ;
+% endif
 % for imm in ops[opcode]["immediates"]:
     I->${imm} = ${imm};
 % endfor
@@ -170,11 +175,16 @@ modifier_lists = order_modifiers(ir_instructions)
 
 # Generate type signature for a builder routine
 
-def should_skip(mod):
+def should_skip(mod, op):
+    # FROUND and HADD only make sense in context of a round mode, so override
+    # the usual skip
+    if mod == "round" and ("FROUND" in op or "HADD" in op):
+        return False
+
     return mod in SKIP or mod[0:-1] in SKIP
 
 def modifier_signature(op):
-    return sorted([m for m in op["modifiers"].keys() if not should_skip(m)])
+    return sorted([m for m in op["modifiers"].keys() if not should_skip(m, op["key"])])
 
 def signature(op, modifiers, typeful = False, sized = False, no_dests = False):
     return ", ".join(
diff --git a/src/panfrost/bifrost/bifrost_compile.c b/src/panfrost/bifrost/bifrost_compile.c
index 15b5807aae5..83507f477a0 100644
--- a/src/panfrost/bifrost/bifrost_compile.c
+++ b/src/panfrost/bifrost/bifrost_compile.c
@@ -147,7 +147,7 @@ bi_varying_src0_for_barycentric(bi_builder *b, nir_intrinsic_instr *intr)
 
                 if (sz == 16) {
                         f16 = bi_fma_v2f16(b, offset, bi_imm_f16(256.0),
-                                        bi_imm_f16(128.0), BI_ROUND_NONE);
+                                        bi_imm_f16(128.0));
                 } else {
                         assert(sz == 32);
                         bi_index f[2];
@@ -155,13 +155,13 @@ bi_varying_src0_for_barycentric(bi_builder *b, nir_intrinsic_instr *intr)
                                 f[i] = bi_fadd_rscale_f32(b,
                                                 bi_word(offset, i),
                                                 bi_imm_f32(0.5), bi_imm_u32(8),
-                                                BI_ROUND_NONE, BI_SPECIAL_NONE);
+                                                BI_SPECIAL_NONE);
                         }
 
-                        f16 = bi_v2f32_to_v2f16(b, f[0], f[1], BI_ROUND_NONE);
+                        f16 = bi_v2f32_to_v2f16(b, f[0], f[1]);
                 }
 
-                return bi_v2f16_to_v2s16(b, f16, BI_ROUND_RTZ);
+                return bi_v2f16_to_v2s16(b, f16);
         }
 
         case nir_intrinsic_load_barycentric_pixel:
@@ -1244,7 +1244,7 @@ bi_emit_load_frag_coord(bi_builder *b, nir_intrinsic_instr *instr)
         for (unsigned i = 0; i < 2; ++i) {
                 src[i] = bi_fadd_f32(b,
                                 bi_u16_to_f32(b, bi_half(bi_register(59), i)),
-                                bi_imm_f32(0.5f), BI_ROUND_NONE);
+                                bi_imm_f32(0.5f));
         }
 
         for (unsigned i = 0; i < 2; ++i) {
@@ -1691,7 +1691,7 @@ bi_nir_round(nir_op op)
 static bi_index
 bi_fmul_f32(bi_builder *b, bi_index s0, bi_index s1)
 {
-        return bi_fma_f32(b, s0, s1, bi_imm_f32(-0.0f), BI_ROUND_NONE);
+        return bi_fma_f32(b, s0, s1, bi_imm_f32(-0.0f));
 }
 
 /* Approximate with FRCP_APPROX.f32 and apply a single iteration of
@@ -1704,9 +1704,8 @@ bi_lower_frcp_32(bi_builder *b, bi_index dst, bi_index s0)
         bi_index m  = bi_frexpm_f32(b, s0, false, false);
         bi_index e  = bi_frexpe_f32(b, bi_neg(s0), false, false);
         bi_index t1 = bi_fma_rscale_f32(b, m, bi_neg(x1), bi_imm_f32(1.0),
-                        bi_zero(), BI_ROUND_NONE, BI_SPECIAL_N);
-        bi_fma_rscale_f32_to(b, dst, t1, x1, x1, e,
-                        BI_ROUND_NONE, BI_SPECIAL_NONE);
+                        bi_zero(), BI_SPECIAL_N);
+        bi_fma_rscale_f32_to(b, dst, t1, x1, x1, e, BI_SPECIAL_NONE);
 }
 
 static void
@@ -1717,9 +1716,8 @@ bi_lower_frsq_32(bi_builder *b, bi_index dst, bi_index s0)
         bi_index e  = bi_frexpe_f32(b, bi_neg(s0), false, true);
         bi_index t1 = bi_fmul_f32(b, x1, x1);
         bi_index t2 = bi_fma_rscale_f32(b, m, bi_neg(t1), bi_imm_f32(1.0),
-                        bi_imm_u32(-1), BI_ROUND_NONE, BI_SPECIAL_N);
-        bi_fma_rscale_f32_to(b, dst, t2, x1, x1, e,
-                        BI_ROUND_NONE, BI_SPECIAL_N);
+                        bi_imm_u32(-1), BI_SPECIAL_N);
+        bi_fma_rscale_f32_to(b, dst, t2, x1, x1, e, BI_SPECIAL_N);
 }
 
 /* More complex transcendentals, see
@@ -1730,26 +1728,23 @@ static void
 bi_lower_fexp2_32(bi_builder *b, bi_index dst, bi_index s0)
 {
         bi_index t1 = bi_temp(b->shader);
-        bi_instr *t1_instr = bi_fadd_f32_to(b, t1,
-                        s0, bi_imm_u32(0x49400000), BI_ROUND_NONE);
+        bi_instr *t1_instr = bi_fadd_f32_to(b, t1, s0, bi_imm_u32(0x49400000));
         t1_instr->clamp = BI_CLAMP_CLAMP_0_INF;
 
-        bi_index t2 = bi_fadd_f32(b, t1, bi_imm_u32(0xc9400000), BI_ROUND_NONE);
+        bi_index t2 = bi_fadd_f32(b, t1, bi_imm_u32(0xc9400000));
 
-        bi_instr *a2 = bi_fadd_f32_to(b, bi_temp(b->shader),
-                        s0, bi_neg(t2), BI_ROUND_NONE);
+        bi_instr *a2 = bi_fadd_f32_to(b, bi_temp(b->shader), s0, bi_neg(t2));
         a2->clamp = BI_CLAMP_CLAMP_M1_1;
 
         bi_index a1t = bi_fexp_table_u4(b, t1, BI_ADJ_NONE);
         bi_index t3 = bi_isub_u32(b, t1, bi_imm_u32(0x49400000), false);
         bi_index a1i = bi_arshift_i32(b, t3, bi_null(), bi_imm_u8(4));
         bi_index p1 = bi_fma_f32(b, a2->dest[0], bi_imm_u32(0x3d635635),
-                        bi_imm_u32(0x3e75fffa), BI_ROUND_NONE);
-        bi_index p2 = bi_fma_f32(b, p1, a2->dest[0],
-                        bi_imm_u32(0x3f317218), BI_ROUND_NONE);
+                        bi_imm_u32(0x3e75fffa));
+        bi_index p2 = bi_fma_f32(b, p1, a2->dest[0], bi_imm_u32(0x3f317218));
         bi_index p3 = bi_fmul_f32(b, a2->dest[0], p2);
         bi_instr *x = bi_fma_rscale_f32_to(b, bi_temp(b->shader),
-                        p3, a1t, a1t, a1i, BI_ROUND_NONE, BI_SPECIAL_NONE);
+                        p3, a1t, a1t, a1i, BI_SPECIAL_NONE);
         x->clamp = BI_CLAMP_CLAMP_0_INF;
 
         bi_instr *max = bi_fmax_f32_to(b, dst, x->dest[0], s0);
@@ -1762,12 +1757,13 @@ bi_fexp_32(bi_builder *b, bi_index dst, bi_index s0, bi_index log2_base)
         /* Scale by base, Multiply by 2*24 and convert to integer to get a 8:24
          * fixed-point input */
         bi_index scale = bi_fma_rscale_f32(b, s0, log2_base, bi_negzero(),
-                        bi_imm_u32(24), BI_ROUND_NONE, BI_SPECIAL_NONE);
-        bi_index fixed_pt = bi_f32_to_s32(b, scale, BI_ROUND_NONE);
+                        bi_imm_u32(24), BI_SPECIAL_NONE);
+        bi_instr *fixed_pt = bi_f32_to_s32_to(b, bi_temp(b->shader), scale);
+        fixed_pt->round = BI_ROUND_NONE; // XXX
 
         /* Compute the result for the fixed-point input, but pass along
          * the floating-point scale for correct NaN propagation */
-        bi_fexp_f32_to(b, dst, fixed_pt, scale);
+        bi_fexp_f32_to(b, dst, fixed_pt->dest[0], scale);
 }
 
 static void
@@ -1776,7 +1772,7 @@ bi_lower_flog2_32(bi_builder *b, bi_index dst, bi_index s0)
         /* s0 = a1 * 2^e, with a1 in [0.75, 1.5) */
         bi_index a1 = bi_frexpm_f32(b, s0, true, false);
         bi_index ei = bi_frexpe_f32(b, s0, true, false);
-        bi_index ef = bi_s32_to_f32(b, ei, BI_ROUND_RTZ);
+        bi_index ef = bi_s32_to_f32(b, ei);
 
         /* xt estimates -log(r1), a coarse approximation of log(a1) */
         bi_index r1 = bi_flog_table_f32(b, s0, BI_MODE_RED, BI_PRECISION_NONE);
@@ -1785,33 +1781,32 @@ bi_lower_flog2_32(bi_builder *b, bi_index dst, bi_index s0)
         /* log(s0) = log(a1 * 2^e) = e + log(a1) = e + log(a1 * r1) -
          * log(r1), so let x1 = e - log(r1) ~= e + xt and x2 = log(a1 * r1),
          * and then log(s0) = x1 + x2 */
-        bi_index x1 = bi_fadd_f32(b, ef, xt, BI_ROUND_NONE);
+        bi_index x1 = bi_fadd_f32(b, ef, xt);
 
         /* Since a1 * r1 is close to 1, x2 = log(a1 * r1) may be computed by
          * polynomial approximation around 1. The series is expressed around
          * 1, so set y = (a1 * r1) - 1.0 */
-        bi_index y = bi_fma_f32(b, a1, r1, bi_imm_f32(-1.0), BI_ROUND_NONE);
+        bi_index y = bi_fma_f32(b, a1, r1, bi_imm_f32(-1.0));
 
         /* x2 = log_2(1 + y) = log_e(1 + y) * (1/log_e(2)), so approximate
          * log_e(1 + y) by the Taylor series (lower precision than the blob):
          * y - y^2/2 + O(y^3) = y(1 - y/2) + O(y^3) */
         bi_index loge = bi_fmul_f32(b, y,
-                bi_fma_f32(b, y, bi_imm_f32(-0.5), bi_imm_f32(1.0), BI_ROUND_NONE));
+                bi_fma_f32(b, y, bi_imm_f32(-0.5), bi_imm_f32(1.0)));
 
         bi_index x2 = bi_fmul_f32(b, loge, bi_imm_f32(1.0 / logf(2.0)));
 
         /* log(s0) = x1 + x2 */
-        bi_fadd_f32_to(b, dst, x1, x2, BI_ROUND_NONE);
+        bi_fadd_f32_to(b, dst, x1, x2);
 }
 
 static void
 bi_flog2_32(bi_builder *b, bi_index dst, bi_index s0)
 {
         bi_index frexp = bi_frexpe_f32(b, s0, true, false);
-        bi_index frexpi = bi_s32_to_f32(b, frexp, BI_ROUND_RTZ);
+        bi_index frexpi = bi_s32_to_f32(b, frexp);
         bi_index add = bi_fadd_lscale_f32(b, bi_imm_f32(-1.0f), s0);
-        bi_fma_f32_to(b, dst, bi_flogd_f32(b, s0), add, frexpi,
-                        BI_ROUND_NONE);
+        bi_fma_f32_to(b, dst, bi_flogd_f32(b, s0), add, frexpi);
 }
 
 static void
@@ -1862,12 +1857,11 @@ static void
 bi_lower_fsincos_32(bi_builder *b, bi_index dst, bi_index s0, bool cos)
 {
         /* bottom 6-bits of result times pi/32 approximately s0 mod 2pi */
-        bi_index x_u6 = bi_fma_f32(b, s0, TWO_OVER_PI, SINCOS_BIAS, BI_ROUND_NONE);
+        bi_index x_u6 = bi_fma_f32(b, s0, TWO_OVER_PI, SINCOS_BIAS);
 
         /* Approximate domain error (small) */
-        bi_index e = bi_fma_f32(b, bi_fadd_f32(b, x_u6, bi_neg(SINCOS_BIAS),
-                                BI_ROUND_NONE),
-                        MPI_OVER_TWO, s0, BI_ROUND_NONE);
+        bi_index e = bi_fma_f32(b, bi_fadd_f32(b, x_u6, bi_neg(SINCOS_BIAS)),
+                        MPI_OVER_TWO, s0);
 
         /* Lookup sin(x), cos(x) */
         bi_index sinx = bi_fsin_table_u6(b, x_u6, false);
@@ -1875,21 +1869,21 @@ bi_lower_fsincos_32(bi_builder *b, bi_index dst, bi_index s0, bool cos)
 
         /* e^2 / 2 */
         bi_index e2_over_2 = bi_fma_rscale_f32(b, e, e, bi_negzero(),
-                        bi_imm_u32(-1), BI_ROUND_NONE, BI_SPECIAL_NONE);
+                        bi_imm_u32(-1), BI_SPECIAL_NONE);
 
         /* (-e^2)/2 f''(x) */
         bi_index quadratic = bi_fma_f32(b, bi_neg(e2_over_2),
                         cos ? cosx : sinx,
-                        bi_negzero(),  BI_ROUND_NONE);
+                        bi_negzero());
 
         /* e f'(x) - (e^2/2) f''(x) */
         bi_instr *I = bi_fma_f32_to(b, bi_temp(b->shader), e,
                         cos ? bi_neg(sinx) : cosx,
-                        quadratic, BI_ROUND_NONE);
+                        quadratic);
         I->clamp = BI_CLAMP_CLAMP_M1_1;
 
         /* f(x) + e f'(x) - (e^2/2) f''(x) */
-        bi_fadd_f32_to(b, dst, I->dest[0], cos ? cosx : sinx, BI_ROUND_NONE);
+        bi_fadd_f32_to(b, dst, I->dest[0], cos ? cosx : sinx);
 }
 
 /* The XOR lane op is useful for derivative calculation, but was added in v7.
@@ -2056,7 +2050,7 @@ bi_emit_alu(bi_builder *b, nir_alu_instr *instr)
                 bi_index s1 = comps > 1 ?
                         bi_word(idx, instr->src[0].swizzle[1]) : s0;
 
-                bi_v2f32_to_v2f16_to(b, dst, s0, s1, BI_ROUND_NONE);
+                bi_v2f32_to_v2f16_to(b, dst, s0, s1);
                 return;
 
         /* Vectorized downcasts */
@@ -2095,9 +2089,9 @@ bi_emit_alu(bi_builder *b, nir_alu_instr *instr)
                                           bi_half(s1, false));
 
                 if (instr->op == nir_op_u2f16)
-                        bi_v2u16_to_v2f16_to(b, dst, t, BI_ROUND_NONE);
+                        bi_v2u16_to_v2f16_to(b, dst, t);
                 else
-                        bi_v2s16_to_v2f16_to(b, dst, t, BI_ROUND_NONE);
+                        bi_v2s16_to_v2f16_to(b, dst, t);
 
                 return;
         }
@@ -2158,18 +2152,18 @@ bi_emit_alu(bi_builder *b, nir_alu_instr *instr)
 
         switch (instr->op) {
         case nir_op_ffma:
-                bi_fma_to(b, sz, dst, s0, s1, s2, BI_ROUND_NONE);
+                bi_fma_to(b, sz, dst, s0, s1, s2);
                 break;
 
         case nir_op_fmul:
-                bi_fma_to(b, sz, dst, s0, s1, bi_negzero(), BI_ROUND_NONE);
+                bi_fma_to(b, sz, dst, s0, s1, bi_negzero());
                 break;
 
         case nir_op_fsub:
                 s1 = bi_neg(s1);
                 FALLTHROUGH;
         case nir_op_fadd:
-                bi_fadd_to(b, sz, dst, s0, s1, BI_ROUND_NONE);
+                bi_fadd_to(b, sz, dst, s0, s1);
                 break;
 
         case nir_op_fsat: {
@@ -2245,7 +2239,7 @@ bi_emit_alu(bi_builder *b, nir_alu_instr *instr)
                 break;
 
         case nir_op_ldexp:
-                bi_ldexp_to(b, sz, dst, s0, s1, BI_ROUND_NONE);
+                bi_ldexp_to(b, sz, dst, s0, s1);
                 break;
 
         case nir_op_b8csel:
@@ -2290,7 +2284,7 @@ bi_emit_alu(bi_builder *b, nir_alu_instr *instr)
         case nir_op_fddy_must_abs_mali: {
                 bi_index bit = bi_imm_u32(instr->op == nir_op_fddx_must_abs_mali ? 1 : 2);
                 bi_index adjacent = bi_clper_xor(b, s0, bit);
-                bi_fadd_to(b, sz, dst, adjacent, bi_neg(s0), BI_ROUND_NONE);
+                bi_fadd_to(b, sz, dst, adjacent, bi_neg(s0));
                 break;
         }
 
@@ -2355,7 +2349,7 @@ bi_emit_alu(bi_builder *b, nir_alu_instr *instr)
                                         BI_SUBGROUP_SUBGROUP4);
                 }
 
-                bi_fadd_to(b, sz, dst, right, bi_neg(left), BI_ROUND_NONE);
+                bi_fadd_to(b, sz, dst, right, bi_neg(left));
                 break;
         }
 
@@ -2365,45 +2359,45 @@ bi_emit_alu(bi_builder *b, nir_alu_instr *instr)
 
         case nir_op_f2i32:
                 if (src_sz == 32)
-                        bi_f32_to_s32_to(b, dst, s0, BI_ROUND_RTZ);
+                        bi_f32_to_s32_to(b, dst, s0);
                 else
-                        bi_f16_to_s32_to(b, dst, s0, BI_ROUND_RTZ);
+                        bi_f16_to_s32_to(b, dst, s0);
                 break;
 
         /* Note 32-bit sources => no vectorization, so 32-bit works */
         case nir_op_f2u16:
                 if (src_sz == 32)
-                        bi_f32_to_u32_to(b, dst, s0, BI_ROUND_RTZ);
+                        bi_f32_to_u32_to(b, dst, s0);
                 else
-                        bi_v2f16_to_v2u16_to(b, dst, s0, BI_ROUND_RTZ);
+                        bi_v2f16_to_v2u16_to(b, dst, s0);
                 break;
 
         case nir_op_f2i16:
                 if (src_sz == 32)
-                        bi_f32_to_s32_to(b, dst, s0, BI_ROUND_RTZ);
+                        bi_f32_to_s32_to(b, dst, s0);
                 else
-                        bi_v2f16_to_v2s16_to(b, dst, s0, BI_ROUND_RTZ);
+                        bi_v2f16_to_v2s16_to(b, dst, s0);
                 break;
 
         case nir_op_f2u32:
                 if (src_sz == 32)
-                        bi_f32_to_u32_to(b, dst, s0, BI_ROUND_RTZ);
+                        bi_f32_to_u32_to(b, dst, s0);
                 else
-                        bi_f16_to_u32_to(b, dst, s0, BI_ROUND_RTZ);
+                        bi_f16_to_u32_to(b, dst, s0);
                 break;
 
         case nir_op_u2f16:
                 if (src_sz == 32)
-                        bi_v2u16_to_v2f16_to(b, dst, bi_half(s0, false), BI_ROUND_RTZ);
+                        bi_v2u16_to_v2f16_to(b, dst, bi_half(s0, false));
                 else if (src_sz == 16)
-                        bi_v2u16_to_v2f16_to(b, dst, s0, BI_ROUND_RTZ);
+                        bi_v2u16_to_v2f16_to(b, dst, s0);
                 else if (src_sz == 8)
                         bi_v2u8_to_v2f16_to(b, dst, s0);
                 break;
 
         case nir_op_u2f32:
                 if (src_sz == 32)
-                        bi_u32_to_f32_to(b, dst, s0, BI_ROUND_RTZ);
+                        bi_u32_to_f32_to(b, dst, s0);
                 else if (src_sz == 16)
                         bi_u16_to_f32_to(b, dst, s0);
                 else
@@ -2412,9 +2406,9 @@ bi_emit_alu(bi_builder *b, nir_alu_instr *instr)
 
         case nir_op_i2f16:
                 if (src_sz == 32)
-                        bi_v2s16_to_v2f16_to(b, dst, bi_half(s0, false), BI_ROUND_RTZ);
+                        bi_v2s16_to_v2f16_to(b, dst, bi_half(s0, false));
                 else if (src_sz == 16)
-                        bi_v2s16_to_v2f16_to(b, dst, s0, BI_ROUND_RTZ);
+                        bi_v2s16_to_v2f16_to(b, dst, s0);
                 else if (src_sz == 8)
                         bi_v2s8_to_v2f16_to(b, dst, s0);
                 break;
@@ -2423,7 +2417,7 @@ bi_emit_alu(bi_builder *b, nir_alu_instr *instr)
                 assert(src_sz == 32 || src_sz == 16 || src_sz == 8);
 
                 if (src_sz == 32)
-                        bi_s32_to_f32_to(b, dst, s0, BI_ROUND_RTZ);
+                        bi_s32_to_f32_to(b, dst, s0);
                 else if (src_sz == 16)
                         bi_s16_to_f32_to(b, dst, s0);
                 else if (src_sz == 8)
@@ -2732,7 +2726,9 @@ bi_emit_texc_array_index(bi_builder *b, bi_index idx, nir_alu_type T)
          * 0, dt - 1). So we use round RTE, clamping is handled at the data
          * structure level */
 
-        return bi_f32_to_u32(b, idx, BI_ROUND_NONE);
+        bi_instr *I = bi_f32_to_u32_to(b, bi_temp(b->shader), idx);
+        I->round = BI_ROUND_NONE;
+        return I->dest[0];
 }
 
 /* TEXC's explicit and bias LOD modes requires the LOD to be transformed to a
@@ -2760,16 +2756,15 @@ bi_emit_texc_lod_88(bi_builder *b, bi_index lod, bool fp16)
 
         bi_instr *fsat = bi_fma_f32_to(b, bi_temp(b->shader),
                         fp16 ? bi_half(lod, false) : lod,
-                        bi_imm_f32(1.0f / max_lod), bi_negzero(), BI_ROUND_NONE);
+                        bi_imm_f32(1.0f / max_lod), bi_negzero());
 
         fsat->clamp = BI_CLAMP_CLAMP_M1_1;
 
         bi_index fmul = bi_fma_f32(b, fsat->dest[0], bi_imm_f32(max_lod * 256.0f),
-                        bi_negzero(), BI_ROUND_NONE);
+                        bi_negzero());
 
         return bi_mkvec_v2i16(b,
-                        bi_half(bi_f32_to_s32(b, fmul, BI_ROUND_RTZ), false),
-                        bi_imm_u16(0));
+                        bi_half(bi_f32_to_s32(b, fmul), false), bi_imm_u16(0));
 }
 
 /* FETCH takes a 32-bit staging register containing the LOD as an integer in
@@ -2911,17 +2906,14 @@ bi_emit_cube_coord(bi_builder *b, bi_index coord,
         bi_index rcp = bi_frcp_f32(b, maxxyz);
 
         /* Calculate 0.5 * (1.0 / max{x, y, z}) */
-        bi_index fma1 = bi_fma_f32(b, rcp, bi_imm_f32(0.5f), bi_negzero(),
-                        BI_ROUND_NONE);
+        bi_index fma1 = bi_fma_f32(b, rcp, bi_imm_f32(0.5f), bi_negzero());
 
         /* Transform the coordinates */
         *s = bi_temp(b->shader);
         *t = bi_temp(b->shader);
 
-        bi_instr *S = bi_fma_f32_to(b, *s, fma1, ssel, bi_imm_f32(0.5f),
-                        BI_ROUND_NONE);
-        bi_instr *T = bi_fma_f32_to(b, *t, fma1, tsel, bi_imm_f32(0.5f),
-                        BI_ROUND_NONE);
+        bi_instr *S = bi_fma_f32_to(b, *s, fma1, ssel, bi_imm_f32(0.5f));
+        bi_instr *T = bi_fma_f32_to(b, *t, fma1, tsel, bi_imm_f32(0.5f));
 
         S->clamp = BI_CLAMP_CLAMP_0_1;
         T->clamp = BI_CLAMP_CLAMP_0_1;
diff --git a/src/panfrost/bifrost/bifrost_isa.py b/src/panfrost/bifrost/bifrost_isa.py
index ba5e62aba48..f2626140708 100644
--- a/src/panfrost/bifrost/bifrost_isa.py
+++ b/src/panfrost/bifrost/bifrost_isa.py
@@ -278,6 +278,7 @@ def combine_ir_variants(instructions, key):
     # Great, we've checked srcs/immediates are consistent and we've summed over
     # modifiers
     return {
+            'key': key,
             'srcs': variants[0]['srcs'],
             'dests': variants[0]['dests'],
             'staging': variants[0]['staging'],
diff --git a/src/panfrost/bifrost/test/test-constant-fold.cpp b/src/panfrost/bifrost/test/test-constant-fold.cpp
index 660e7793d1a..d28fc953ae8 100644
--- a/src/panfrost/bifrost/test/test-constant-fold.cpp
+++ b/src/panfrost/bifrost/test/test-constant-fold.cpp
@@ -186,6 +186,6 @@ TEST_F(ConstantFold, OtherOperationsShouldNotFold)
    bi_index zero = bi_fau(bir_fau(BIR_FAU_IMMEDIATE | 0), false);
    bi_index reg = bi_register(0);
 
-   EXPECT_NOT_FOLD(bi_fma_f32_to(b, reg, zero, zero, zero, BI_ROUND_NONE));
-   EXPECT_NOT_FOLD(bi_fadd_f32_to(b, reg, zero, zero, BI_ROUND_NONE));
+   EXPECT_NOT_FOLD(bi_fma_f32_to(b, reg, zero, zero, zero));
+   EXPECT_NOT_FOLD(bi_fadd_f32_to(b, reg, zero, zero));
 }
diff --git a/src/panfrost/bifrost/test/test-optimizer.cpp b/src/panfrost/bifrost/test/test-optimizer.cpp
index 80b81597786..50593ea25f0 100644
--- a/src/panfrost/bifrost/test/test-optimizer.cpp
+++ b/src/panfrost/bifrost/test/test-optimizer.cpp
@@ -63,17 +63,17 @@ protected:
 
 TEST_F(Optimizer, FusedFABSNEG)
 {
-   CASE(bi_fadd_f32_to(b, reg, bi_fabsneg_f32(b, bi_abs(x)), y, BI_ROUND_NONE),
-        bi_fadd_f32_to(b, reg, bi_abs(x), y, BI_ROUND_NONE));
+   CASE(bi_fadd_f32_to(b, reg, bi_fabsneg_f32(b, bi_abs(x)), y),
+        bi_fadd_f32_to(b, reg, bi_abs(x), y));
 
-   CASE(bi_fadd_f32_to(b, reg, bi_fabsneg_f32(b, bi_neg(x)), y, BI_ROUND_NONE),
-        bi_fadd_f32_to(b, reg, bi_neg(x), y, BI_ROUND_NONE));
+   CASE(bi_fadd_f32_to(b, reg, bi_fabsneg_f32(b, bi_neg(x)), y),
+        bi_fadd_f32_to(b, reg, bi_neg(x), y));
 
-   CASE(bi_fadd_f32_to(b, reg, bi_fabsneg_f32(b, negabsx), y, BI_ROUND_NONE),
-        bi_fadd_f32_to(b, reg, negabsx, y, BI_ROUND_NONE));
+   CASE(bi_fadd_f32_to(b, reg, bi_fabsneg_f32(b, negabsx), y),
+        bi_fadd_f32_to(b, reg, negabsx, y));
 
-   CASE(bi_fadd_f32_to(b, reg, bi_fabsneg_f32(b, x), y, BI_ROUND_NONE),
-        bi_fadd_f32_to(b, reg, x, y, BI_ROUND_NONE));
+   CASE(bi_fadd_f32_to(b, reg, bi_fabsneg_f32(b, x), y),
+        bi_fadd_f32_to(b, reg, x, y));
 
    CASE(bi_fmin_f32_to(b, reg, bi_fabsneg_f32(b, negabsx), bi_neg(y)),
         bi_fmin_f32_to(b, reg, negabsx, bi_neg(y)));
@@ -81,8 +81,8 @@ TEST_F(Optimizer, FusedFABSNEG)
 
 TEST_F(Optimizer, FusedFABSNEGForFP16)
 {
-   CASE(bi_fadd_v2f16_to(b, reg, bi_fabsneg_v2f16(b, negabsx), y, BI_ROUND_NONE),
-        bi_fadd_v2f16_to(b, reg, negabsx, y, BI_ROUND_NONE));
+   CASE(bi_fadd_v2f16_to(b, reg, bi_fabsneg_v2f16(b, negabsx), y),
+        bi_fadd_v2f16_to(b, reg, negabsx, y));
 
    CASE(bi_fmin_v2f16_to(b, reg, bi_fabsneg_v2f16(b, negabsx), bi_neg(y)),
         bi_fmin_v2f16_to(b, reg, negabsx, bi_neg(y)));
@@ -91,26 +91,26 @@ TEST_F(Optimizer, FusedFABSNEGForFP16)
 TEST_F(Optimizer, FuseFADD_F32WithEqualSourcesAbsAbsAndClamp)
 {
    CASE({
-         bi_instr *I = bi_fadd_f32_to(b, reg, bi_fabsneg_f32(b, bi_abs(x)), bi_abs(x), BI_ROUND_NONE);
+         bi_instr *I = bi_fadd_f32_to(b, reg, bi_fabsneg_f32(b, bi_abs(x)), bi_abs(x));
          I->clamp = BI_CLAMP_CLAMP_0_1;
    }, {
-         bi_instr *I = bi_fadd_f32_to(b, reg, bi_abs(x), bi_abs(x), BI_ROUND_NONE);
+         bi_instr *I = bi_fadd_f32_to(b, reg, bi_abs(x), bi_abs(x));
          I->clamp = BI_CLAMP_CLAMP_0_1;
    });
 
    CASE({
-         bi_instr *I = bi_fadd_f32_to(b, reg, bi_abs(x), bi_fabsneg_f32(b, bi_abs(x)), BI_ROUND_NONE);
+         bi_instr *I = bi_fadd_f32_to(b, reg, bi_abs(x), bi_fabsneg_f32(b, bi_abs(x)));
          I->clamp = BI_CLAMP_CLAMP_0_1;
    }, {
-         bi_instr *I = bi_fadd_f32_to(b, reg, bi_abs(x), bi_abs(x), BI_ROUND_NONE);
+         bi_instr *I = bi_fadd_f32_to(b, reg, bi_abs(x), bi_abs(x));
          I->clamp = BI_CLAMP_CLAMP_0_1;
    });
 
    CASE({
-         bi_instr *I = bi_fclamp_f32_to(b, reg, bi_fadd_f32(b, bi_abs(x), bi_abs(x), BI_ROUND_NONE));
+         bi_instr *I = bi_fclamp_f32_to(b, reg, bi_fadd_f32(b, bi_abs(x), bi_abs(x)));
          I->clamp = BI_CLAMP_CLAMP_0_INF;
    }, {
-         bi_instr *I = bi_fadd_f32_to(b, reg, bi_abs(x), bi_abs(x), BI_ROUND_NONE);
+         bi_instr *I = bi_fadd_f32_to(b, reg, bi_abs(x), bi_abs(x));
          I->clamp = BI_CLAMP_CLAMP_0_INF;
    });
 }
@@ -118,26 +118,26 @@ TEST_F(Optimizer, FuseFADD_F32WithEqualSourcesAbsAbsAndClamp)
 TEST_F(Optimizer, FuseFADD_V2F16WithDifferentSourcesAbsAbsAndClamp)
 {
    CASE({
-         bi_instr *I = bi_fadd_v2f16_to(b, reg, bi_fabsneg_v2f16(b, bi_abs(x)), bi_abs(y), BI_ROUND_NONE);
+         bi_instr *I = bi_fadd_v2f16_to(b, reg, bi_fabsneg_v2f16(b, bi_abs(x)), bi_abs(y));
          I->clamp = BI_CLAMP_CLAMP_0_1;
    }, {
-         bi_instr *I = bi_fadd_v2f16_to(b, reg, bi_abs(x), bi_abs(y), BI_ROUND_NONE);
+         bi_instr *I = bi_fadd_v2f16_to(b, reg, bi_abs(x), bi_abs(y));
          I->clamp = BI_CLAMP_CLAMP_0_1;
    });
 
    CASE({
-         bi_instr *I = bi_fadd_v2f16_to(b, reg, bi_abs(x), bi_fabsneg_v2f16(b, bi_abs(y)), BI_ROUND_NONE);
+         bi_instr *I = bi_fadd_v2f16_to(b, reg, bi_abs(x), bi_fabsneg_v2f16(b, bi_abs(y)));
          I->clamp = BI_CLAMP_CLAMP_0_1;
    }, {
-         bi_instr *I = bi_fadd_v2f16_to(b, reg, bi_abs(x), bi_abs(y), BI_ROUND_NONE);
+         bi_instr *I = bi_fadd_v2f16_to(b, reg, bi_abs(x), bi_abs(y));
          I->clamp = BI_CLAMP_CLAMP_0_1;
    });
 
    CASE({
-         bi_instr *I = bi_fclamp_v2f16_to(b, reg, bi_fadd_v2f16(b, bi_abs(x), bi_abs(y), BI_ROUND_NONE));
+         bi_instr *I = bi_fclamp_v2f16_to(b, reg, bi_fadd_v2f16(b, bi_abs(x), bi_abs(y)));
          I->clamp = BI_CLAMP_CLAMP_0_INF;
    }, {
-         bi_instr *I = bi_fadd_v2f16_to(b, reg, bi_abs(x), bi_abs(y), BI_ROUND_NONE);
+         bi_instr *I = bi_fadd_v2f16_to(b, reg, bi_abs(x), bi_abs(y));
          I->clamp = BI_CLAMP_CLAMP_0_INF;
    });
 }
@@ -145,57 +145,57 @@ TEST_F(Optimizer, FuseFADD_V2F16WithDifferentSourcesAbsAbsAndClamp)
 TEST_F(Optimizer, AvoidFADD_V2F16WithEqualSourcesAbsAbsAndClamp)
 {
    NEGCASE({
-         bi_instr *I = bi_fadd_v2f16_to(b, reg, bi_fabsneg_v2f16(b, bi_abs(x)), bi_abs(x), BI_ROUND_NONE);
+         bi_instr *I = bi_fadd_v2f16_to(b, reg, bi_fabsneg_v2f16(b, bi_abs(x)), bi_abs(x));
          I->clamp = BI_CLAMP_CLAMP_0_1;
    });
 
    NEGCASE({
-         bi_instr *I = bi_fadd_v2f16_to(b, reg, bi_abs(x), bi_fabsneg_v2f16(b, bi_abs(x)), BI_ROUND_NONE);
+         bi_instr *I = bi_fadd_v2f16_to(b, reg, bi_abs(x), bi_fabsneg_v2f16(b, bi_abs(x)));
          I->clamp = BI_CLAMP_CLAMP_0_1;
    });
 
    NEGCASE({
-      bi_instr *I = bi_fclamp_v2f16_to(b, reg, bi_fadd_v2f16(b, bi_abs(x), bi_abs(x), BI_ROUND_NONE));
+      bi_instr *I = bi_fclamp_v2f16_to(b, reg, bi_fadd_v2f16(b, bi_abs(x), bi_abs(x)));
       I->clamp = BI_CLAMP_CLAMP_0_INF;
    });
 }
 
 TEST_F(Optimizer, SwizzlesComposedForFP16)
 {
-   CASE(bi_fadd_v2f16_to(b, reg, bi_fabsneg_v2f16(b, bi_swz_16(negabsx, true, false)), y, BI_ROUND_NONE),
-        bi_fadd_v2f16_to(b, reg, bi_swz_16(negabsx, true, false), y, BI_ROUND_NONE));
+   CASE(bi_fadd_v2f16_to(b, reg, bi_fabsneg_v2f16(b, bi_swz_16(negabsx, true, false)), y),
+        bi_fadd_v2f16_to(b, reg, bi_swz_16(negabsx, true, false), y));
 
-   CASE(bi_fadd_v2f16_to(b, reg, bi_swz_16(bi_fabsneg_v2f16(b, negabsx), true, false), y, BI_ROUND_NONE),
-        bi_fadd_v2f16_to(b, reg, bi_swz_16(negabsx, true, false), y, BI_ROUND_NONE));
+   CASE(bi_fadd_v2f16_to(b, reg, bi_swz_16(bi_fabsneg_v2f16(b, negabsx), true, false), y),
+        bi_fadd_v2f16_to(b, reg, bi_swz_16(negabsx, true, false), y));
 
-   CASE(bi_fadd_v2f16_to(b, reg, bi_swz_16(bi_fabsneg_v2f16(b, bi_swz_16(negabsx, true, false)), true, false), y, BI_ROUND_NONE),
-        bi_fadd_v2f16_to(b, reg, negabsx, y, BI_ROUND_NONE));
+   CASE(bi_fadd_v2f16_to(b, reg, bi_swz_16(bi_fabsneg_v2f16(b, bi_swz_16(negabsx, true, false)), true, false), y),
+        bi_fadd_v2f16_to(b, reg, negabsx, y));
 
-   CASE(bi_fadd_v2f16_to(b, reg, bi_swz_16(bi_fabsneg_v2f16(b, bi_half(negabsx, false)), true, false), y, BI_ROUND_NONE),
-        bi_fadd_v2f16_to(b, reg, bi_half(negabsx, false), y, BI_ROUND_NONE));
+   CASE(bi_fadd_v2f16_to(b, reg, bi_swz_16(bi_fabsneg_v2f16(b, bi_half(negabsx, false)), true, false), y),
+        bi_fadd_v2f16_to(b, reg, bi_half(negabsx, false), y));
 
-   CASE(bi_fadd_v2f16_to(b, reg, bi_swz_16(bi_fabsneg_v2f16(b, bi_half(negabsx, true)), true, false), y, BI_ROUND_NONE),
-        bi_fadd_v2f16_to(b, reg, bi_half(negabsx, true), y, BI_ROUND_NONE));
+   CASE(bi_fadd_v2f16_to(b, reg, bi_swz_16(bi_fabsneg_v2f16(b, bi_half(negabsx, true)), true, false), y),
+        bi_fadd_v2f16_to(b, reg, bi_half(negabsx, true), y));
 }
 
 TEST_F(Optimizer, PreserveWidens)
 {
    /* Check that widens are passed through */
-   CASE(bi_fadd_f32_to(b, reg, bi_fabsneg_f32(b, bi_half(negabsx, false)), y, BI_ROUND_NONE),
-        bi_fadd_f32_to(b, reg, bi_half(negabsx, false), y, BI_ROUND_NONE));
+   CASE(bi_fadd_f32_to(b, reg, bi_fabsneg_f32(b, bi_half(negabsx, false)), y),
+        bi_fadd_f32_to(b, reg, bi_half(negabsx, false), y));
 
-   CASE(bi_fadd_f32_to(b, reg, bi_fabsneg_f32(b, bi_half(negabsx, true)), y, BI_ROUND_NONE),
-        bi_fadd_f32_to(b, reg, bi_half(negabsx, true), y, BI_ROUND_NONE));
+   CASE(bi_fadd_f32_to(b, reg, bi_fabsneg_f32(b, bi_half(negabsx, true)), y),
+        bi_fadd_f32_to(b, reg, bi_half(negabsx, true), y));
 
-   CASE(bi_fadd_f32_to(b, reg, bi_fabsneg_f32(b, bi_half(x, true)), bi_fabsneg_f32(b, bi_half(x, false)), BI_ROUND_NONE),
-        bi_fadd_f32_to(b, reg, bi_half(x, true), bi_half(x, false), BI_ROUND_NONE));
+   CASE(bi_fadd_f32_to(b, reg, bi_fabsneg_f32(b, bi_half(x, true)), bi_fabsneg_f32(b, bi_half(x, false))),
+        bi_fadd_f32_to(b, reg, bi_half(x, true), bi_half(x, false)));
 }
 
 TEST_F(Optimizer, DoNotMixSizesForFABSNEG)
 {
    /* Refuse to mix sizes for fabsneg, that's wrong */
-   NEGCASE(bi_fadd_f32_to(b, reg, bi_fabsneg_v2f16(b, negabsx), y, BI_ROUND_NONE));
-   NEGCASE(bi_fadd_v2f16_to(b, reg, bi_fabsneg_f32(b, negabsx), y, BI_ROUND_NONE));
+   NEGCASE(bi_fadd_f32_to(b, reg, bi_fabsneg_v2f16(b, negabsx), y));
+   NEGCASE(bi_fadd_v2f16_to(b, reg, bi_fabsneg_f32(b, negabsx), y));
 }
 
 TEST_F(Optimizer, AvoidZeroAndFABSNEGFootguns)
@@ -206,27 +206,27 @@ TEST_F(Optimizer, AvoidZeroAndFABSNEGFootguns)
 
    bi_index zero = bi_zero();
 
-   NEGCASE(bi_fadd_f32_to(b, reg, bi_fadd_f32(b, bi_abs(x), zero, BI_ROUND_NONE), y, BI_ROUND_NONE));
-   NEGCASE(bi_fadd_f32_to(b, reg, bi_fadd_f32(b, bi_neg(x), zero, BI_ROUND_NONE), y, BI_ROUND_NONE));
-   NEGCASE(bi_fadd_f32_to(b, reg, bi_fadd_f32(b, bi_neg(bi_abs(x)), zero, BI_ROUND_NONE), y, BI_ROUND_NONE));
-   NEGCASE(bi_fadd_f32_to(b, reg, bi_fadd_f32(b, x, zero, BI_ROUND_NONE), y, BI_ROUND_NONE));
+   NEGCASE(bi_fadd_f32_to(b, reg, bi_fadd_f32(b, bi_abs(x), zero), y));
+   NEGCASE(bi_fadd_f32_to(b, reg, bi_fadd_f32(b, bi_neg(x), zero), y));
+   NEGCASE(bi_fadd_f32_to(b, reg, bi_fadd_f32(b, bi_neg(bi_abs(x)), zero), y));
+   NEGCASE(bi_fadd_f32_to(b, reg, bi_fadd_f32(b, x, zero), y));
 }
 
 TEST_F(Optimizer, ClampsPropagated)
 {
    CASE({
-      bi_instr *I = bi_fclamp_f32_to(b, reg, bi_fadd_f32(b, x, y, BI_ROUND_NONE));
+      bi_instr *I = bi_fclamp_f32_to(b, reg, bi_fadd_f32(b, x, y));
       I->clamp = BI_CLAMP_CLAMP_0_INF;
    }, {
-      bi_instr *I = bi_fadd_f32_to(b, reg, x, y, BI_ROUND_NONE);
+      bi_instr *I = bi_fadd_f32_to(b, reg, x, y);
       I->clamp = BI_CLAMP_CLAMP_0_INF;
    });
 
    CASE({
-      bi_instr *I = bi_fclamp_v2f16_to(b, reg, bi_fadd_v2f16(b, x, y, BI_ROUND_NONE));
+      bi_instr *I = bi_fclamp_v2f16_to(b, reg, bi_fadd_v2f16(b, x, y));
       I->clamp = BI_CLAMP_CLAMP_0_1;
    }, {
-      bi_instr *I = bi_fadd_v2f16_to(b, reg, x, y, BI_ROUND_NONE);
+      bi_instr *I = bi_fadd_v2f16_to(b, reg, x, y);
       I->clamp = BI_CLAMP_CLAMP_0_1;
    });
 }
@@ -235,62 +235,62 @@ TEST_F(Optimizer, ClampsPropagated)
 TEST_F(Optimizer, ClampsComposed)
 {
    CASE({
-      bi_instr *I = bi_fadd_f32_to(b, bi_temp(b->shader), x, y, BI_ROUND_NONE);
+      bi_instr *I = bi_fadd_f32_to(b, bi_temp(b->shader), x, y);
       bi_instr *J = bi_fclamp_f32_to(b, reg, I->dest[0]);
       I->clamp = BI_CLAMP_CLAMP_M1_1;
       J->clamp = BI_CLAMP_CLAMP_0_INF;
    }, {
-      bi_instr *I = bi_fadd_f32_to(b, reg, x, y, BI_ROUND_NONE);
+      bi_instr *I = bi_fadd_f32_to(b, reg, x, y);
       I->clamp = BI_CLAMP_CLAMP_0_1;
    });
 
    CASE({
-      bi_instr *I = bi_fadd_f32_to(b, bi_temp(b->shader), x, y, BI_ROUND_NONE);
+      bi_instr *I = bi_fadd_f32_to(b, bi_temp(b->shader), x, y);
       bi_instr *J = bi_fclamp_f32_to(b, reg, I->dest[0]);
       I->clamp = BI_CLAMP_CLAMP_0_1;
       J->clamp = BI_CLAMP_CLAMP_0_INF;
    }, {
-      bi_instr *I = bi_fadd_f32_to(b, reg, x, y, BI_ROUND_NONE);
+      bi_instr *I = bi_fadd_f32_to(b, reg, x, y);
       I->clamp = BI_CLAMP_CLAMP_0_1;
    });
 
    CASE({
-      bi_instr *I = bi_fadd_f32_to(b, bi_temp(b->shader), x, y, BI_ROUND_NONE);
+      bi_instr *I = bi_fadd_f32_to(b, bi_temp(b->shader), x, y);
       bi_instr *J = bi_fclamp_f32_to(b, reg, I->dest[0]);
       I->clamp = BI_CLAMP_CLAMP_0_INF;
       J->clamp = BI_CLAMP_CLAMP_0_INF;
    }, {
-      bi_instr *I = bi_fadd_f32_to(b, reg, x, y, BI_ROUND_NONE);
+      bi_instr *I = bi_fadd_f32_to(b, reg, x, y);
       I->clamp = BI_CLAMP_CLAMP_0_INF;
    });
 
    CASE({
-      bi_instr *I = bi_fadd_v2f16_to(b, bi_temp(b->shader), x, y, BI_ROUND_NONE);
+      bi_instr *I = bi_fadd_v2f16_to(b, bi_temp(b->shader), x, y);
       bi_instr *J = bi_fclamp_v2f16_to(b, reg, I->dest[0]);
       I->clamp = BI_CLAMP_CLAMP_M1_1;
       J->clamp = BI_CLAMP_CLAMP_0_INF;
    }, {
-      bi_instr *I = bi_fadd_v2f16_to(b, reg, x, y, BI_ROUND_NONE);
+      bi_instr *I = bi_fadd_v2f16_to(b, reg, x, y);
       I->clamp = BI_CLAMP_CLAMP_0_1;
    });
 
    CASE({
-      bi_instr *I = bi_fadd_v2f16_to(b, bi_temp(b->shader), x, y, BI_ROUND_NONE);
+      bi_instr *I = bi_fadd_v2f16_to(b, bi_temp(b->shader), x, y);
       bi_instr *J = bi_fclamp_v2f16_to(b, reg, I->dest[0]);
       I->clamp = BI_CLAMP_CLAMP_0_1;
       J->clamp = BI_CLAMP_CLAMP_0_INF;
    }, {
-      bi_instr *I = bi_fadd_v2f16_to(b, reg, x, y, BI_ROUND_NONE);
+      bi_instr *I = bi_fadd_v2f16_to(b, reg, x, y);
       I->clamp = BI_CLAMP_CLAMP_0_1;
    });
 
    CASE({
-      bi_instr *I = bi_fadd_v2f16_to(b, bi_temp(b->shader), x, y, BI_ROUND_NONE);
+      bi_instr *I = bi_fadd_v2f16_to(b, bi_temp(b->shader), x, y);
       bi_instr *J = bi_fclamp_v2f16_to(b, reg, I->dest[0]);
       I->clamp = BI_CLAMP_CLAMP_0_INF;
       J->clamp = BI_CLAMP_CLAMP_0_INF;
    }, {
-      bi_instr *I = bi_fadd_v2f16_to(b, reg, x, y, BI_ROUND_NONE);
+      bi_instr *I = bi_fadd_v2f16_to(b, reg, x, y);
       I->clamp = BI_CLAMP_CLAMP_0_INF;
    });
 }
@@ -298,12 +298,12 @@ TEST_F(Optimizer, ClampsComposed)
 TEST_F(Optimizer, DoNotMixSizesWhenClamping)
 {
    NEGCASE({
-      bi_instr *I = bi_fclamp_f32_to(b, reg, bi_fadd_v2f16(b, x, y, BI_ROUND_NONE));
+      bi_instr *I = bi_fclamp_f32_to(b, reg, bi_fadd_v2f16(b, x, y));
       I->clamp = BI_CLAMP_CLAMP_0_1;
    });
 
    NEGCASE({
-      bi_instr *I = bi_fclamp_v2f16_to(b, reg, bi_fadd_f32(b, x, y, BI_ROUND_NONE));
+      bi_instr *I = bi_fclamp_v2f16_to(b, reg, bi_fadd_f32(b, x, y));
       I->clamp = BI_CLAMP_CLAMP_0_1;
    });
 }
@@ -314,12 +314,12 @@ TEST_F(Optimizer, DoNotUseAdditionByZeroForClamps)
 
    /* We can't use addition by 0.0 for clamps due to signed zeros. */
    NEGCASE({
-      bi_instr *I = bi_fadd_f32_to(b, reg, bi_fadd_f32(b, x, y, BI_ROUND_NONE), zero, BI_ROUND_NONE);
+      bi_instr *I = bi_fadd_f32_to(b, reg, bi_fadd_f32(b, x, y), zero);
       I->clamp = BI_CLAMP_CLAMP_M1_1;
    });
 
    NEGCASE({
-      bi_instr *I = bi_fadd_v2f16_to(b, reg, bi_fadd_v2f16(b, x, y, BI_ROUND_NONE), zero, BI_ROUND_NONE);
+      bi_instr *I = bi_fadd_v2f16_to(b, reg, bi_fadd_v2f16(b, x, y), zero);
       I->clamp = BI_CLAMP_CLAMP_0_1;
    });
 }
diff --git a/src/panfrost/bifrost/test/test-scheduler-predicates.cpp b/src/panfrost/bifrost/test/test-scheduler-predicates.cpp
index c00402ea7e1..65d700fbfa6 100644
--- a/src/panfrost/bifrost/test/test-scheduler-predicates.cpp
+++ b/src/panfrost/bifrost/test/test-scheduler-predicates.cpp
@@ -56,7 +56,7 @@ TEST_F(SchedulerPredicates, MOV)
 
 TEST_F(SchedulerPredicates, FMA)
 {
-   bi_instr *fma = bi_fma_f32_to(b, TMP(), TMP(), TMP(), bi_zero(), BI_ROUND_NONE);
+   bi_instr *fma = bi_fma_f32_to(b, TMP(), TMP(), TMP(), bi_zero());
    ASSERT_TRUE(bi_can_fma(fma));
    ASSERT_FALSE(bi_can_add(fma));
    ASSERT_FALSE(bi_must_message(fma));
@@ -96,12 +96,12 @@ TEST_F(SchedulerPredicates, BLEND)
 
 TEST_F(SchedulerPredicates, RestrictionsOnModifiersOfSameCycleTemporaries)
 {
-   bi_instr *fadd = bi_fadd_f32_to(b, TMP(), TMP(), TMP(), BI_ROUND_NONE);
+   bi_instr *fadd = bi_fadd_f32_to(b, TMP(), TMP(), TMP());
    ASSERT_TRUE(bi_reads_t(fadd, 0));
 
    for (unsigned i = 0; i < 2; ++i) {
       for (unsigned j = 0; j < 2; ++j) {
-         bi_instr *fadd = bi_fadd_f32_to(b, TMP(), TMP(), TMP(), BI_ROUND_NONE);
+         bi_instr *fadd = bi_fadd_f32_to(b, TMP(), TMP(), TMP());
          fadd->src[i] = bi_swz_16(TMP(), j, j);
          ASSERT_TRUE(bi_reads_t(fadd, 1 - i));
          ASSERT_FALSE(bi_reads_t(fadd, i));
@@ -115,7 +115,7 @@ TEST_F(SchedulerPredicates, RestrictionsOnFAddV2F16)
    bi_index y = bi_register(1);
 
    /* Basic */
-   bi_instr *fadd = bi_fadd_v2f16_to(b, TMP(), x, x, BI_ROUND_NONE);
+   bi_instr *fadd = bi_fadd_v2f16_to(b, TMP(), x, x);
 
    ASSERT_TRUE(bi_can_fma(fadd));
    ASSERT_TRUE(bi_can_add(fadd));
diff --git a/src/panfrost/bifrost/valhall/test/test-add-imm.cpp b/src/panfrost/bifrost/valhall/test/test-add-imm.cpp
index 7004bbbd564..53b394b8b59 100644
--- a/src/panfrost/bifrost/valhall/test/test-add-imm.cpp
+++ b/src/panfrost/bifrost/valhall/test/test-add-imm.cpp
@@ -60,45 +60,37 @@ TEST_F(AddImm, Basic) {
    CASE(bi_mov_i32_to(b, bi_register(63), bi_imm_u32(0xABAD1DEA)),
         bi_iadd_imm_i32_to(b, bi_register(63), bi_zero(), 0xABAD1DEA));
 
-   CASE(bi_fadd_f32_to(b, bi_register(1), bi_register(2), bi_imm_f32(42.0), BI_ROUND_NONE),
+   CASE(bi_fadd_f32_to(b, bi_register(1), bi_register(2), bi_imm_f32(42.0)),
         bi_fadd_imm_f32_to(b, bi_register(1), bi_register(2), fui(42.0)));
 
-   CASE(bi_fadd_f32_to(b, bi_register(1), bi_discard(bi_register(2)), bi_imm_f32(42.0), BI_ROUND_NONE),
+   CASE(bi_fadd_f32_to(b, bi_register(1), bi_discard(bi_register(2)), bi_imm_f32(42.0)),
         bi_fadd_imm_f32_to(b, bi_register(1), bi_discard(bi_register(2)), fui(42.0)));
 
-   CASE(bi_fadd_f32_to(b, bi_register(1), bi_discard(bi_register(2)), bi_neg(bi_imm_f32(42.0)), BI_ROUND_NONE),
+   CASE(bi_fadd_f32_to(b, bi_register(1), bi_discard(bi_register(2)), bi_neg(bi_imm_f32(42.0))),
         bi_fadd_imm_f32_to(b, bi_register(1), bi_discard(bi_register(2)), fui(-42.0)));
 }
 
 TEST_F(AddImm, Commutativty) {
-   CASE(bi_fadd_f32_to(b, bi_register(1), bi_imm_f32(42.0), bi_register(2), BI_ROUND_NONE),
+   CASE(bi_fadd_f32_to(b, bi_register(1), bi_imm_f32(42.0), bi_register(2)),
         bi_fadd_imm_f32_to(b, bi_register(1), bi_register(2), fui(42.0)));
 }
 
 TEST_F(AddImm, NoModifiers) {
-   NEGCASE(bi_fadd_f32_to(b, bi_register(1), bi_register(2), bi_imm_f32(42.0),
-            BI_ROUND_RTP));
-
-   NEGCASE(bi_fadd_f32_to(b, bi_register(1), bi_abs(bi_register(2)), bi_imm_f32(42.0),
-            BI_ROUND_NONE));
-
-   NEGCASE(bi_fadd_f32_to(b, bi_register(1), bi_neg(bi_register(2)), bi_imm_f32(42.0),
-            BI_ROUND_NONE));
-
-   NEGCASE(bi_fadd_f32_to(b, bi_register(1), bi_swz_16(bi_register(2), false, false), bi_imm_f32(42.0),
-            BI_ROUND_NONE));
+   NEGCASE(bi_fadd_f32_to(b, bi_register(1), bi_abs(bi_register(2)), bi_imm_f32(42.0)));
+   NEGCASE(bi_fadd_f32_to(b, bi_register(1), bi_neg(bi_register(2)), bi_imm_f32(42.0)));
+   NEGCASE(bi_fadd_f32_to(b, bi_register(1), bi_swz_16(bi_register(2), false, false), bi_imm_f32(42.0)));
 }
 
 TEST_F(AddImm, NoClamp) {
    NEGCASE({
       bi_instr *I = bi_fadd_f32_to(b, bi_register(1), bi_register(2),
-            bi_imm_f32(42.0), BI_ROUND_NONE);
+            bi_imm_f32(42.0));
       I->clamp = BI_CLAMP_CLAMP_M1_1;
    });
 }
 
 TEST_F(AddImm, OtherTypes) {
-   CASE(bi_fadd_v2f16_to(b, bi_register(1), bi_register(2), bi_imm_f16(42.0), BI_ROUND_NONE),
+   CASE(bi_fadd_v2f16_to(b, bi_register(1), bi_register(2), bi_imm_f16(42.0)),
         bi_fadd_imm_v2f16_to(b, bi_register(1), bi_register(2), 0x51405140));
 
    CASE(bi_iadd_u32_to(b, bi_register(1), bi_register(2), bi_imm_u32(0xDEADBEEF), false),
@@ -119,7 +111,6 @@ TEST_F(AddImm, OtherTypes) {
    CASE(bi_iadd_v4s8_to(b, bi_register(1), bi_register(2), bi_imm_u32(0xDEADBEEF), false),
         bi_iadd_imm_v4i8_to(b, bi_register(1), bi_register(2), 0xDEADBEEF));
 
-   NEGCASE(bi_fadd_v2f16_to(b, bi_register(1), bi_register(2), bi_imm_f16(42.0), BI_ROUND_RTZ));
    NEGCASE(bi_iadd_u32_to(b, bi_register(1), bi_swz_16(bi_register(2), false, false), bi_imm_u32(0xDEADBEEF), false));
    NEGCASE(bi_iadd_v2u16_to(b, bi_register(1), bi_swz_16(bi_register(2), false, false), bi_imm_u32(0xDEADBEEF), false));
    NEGCASE(bi_iadd_u32_to(b, bi_register(1), bi_register(2), bi_imm_u32(0xDEADBEEF), true));
@@ -135,3 +126,16 @@ TEST_F(AddImm, Int8) {
    NEGCASE(bi_iadd_v4u8_to(b, bi_register(1), idx, bi_imm_u32(0xDEADBEEF), false));
    NEGCASE(bi_iadd_v4s8_to(b, bi_register(1), idx, bi_imm_u32(0xDEADBEEF), false));
 }
+
+TEST_F(AddImm, OnlyRTE) {
+   NEGCASE({
+         bi_instr *I = bi_fadd_f32_to(b, bi_register(1), bi_register(2), bi_imm_f32(42.0));
+         I->round = BI_ROUND_RTP;
+   });
+
+   NEGCASE({
+         bi_instr *I = bi_fadd_v2f16_to(b, bi_register(1), bi_register(2), bi_imm_f16(42.0));
+         I->round = BI_ROUND_RTZ;
+   });
+}
+
diff --git a/src/panfrost/bifrost/valhall/test/test-lower-isel.cpp b/src/panfrost/bifrost/valhall/test/test-lower-isel.cpp
index f142e116d2c..de6994b6c4c 100644
--- a/src/panfrost/bifrost/valhall/test/test-lower-isel.cpp
+++ b/src/panfrost/bifrost/valhall/test/test-lower-isel.cpp
@@ -102,7 +102,7 @@ TEST_F(LowerIsel, IntegerCSEL) {
 }
 
 TEST_F(LowerIsel, Smoke) {
-   NEGCASE(bi_fadd_f32_to(b, reg, reg, reg, BI_ROUND_RTP));
+   NEGCASE(bi_fadd_f32_to(b, reg, reg, reg));
    NEGCASE(bi_csel_s32_to(b, reg, reg, reg, reg, reg, BI_CMPF_LT));
    NEGCASE(bi_csel_u32_to(b, reg, reg, reg, reg, reg, BI_CMPF_LT));
 }
diff --git a/src/panfrost/bifrost/valhall/test/test-packing.cpp b/src/panfrost/bifrost/valhall/test/test-packing.cpp
index 34ed4e56924..ce46da36422 100644
--- a/src/panfrost/bifrost/valhall/test/test-packing.cpp
+++ b/src/panfrost/bifrost/valhall/test/test-packing.cpp
@@ -65,42 +65,42 @@ TEST_F(ValhallPacking, Moves) {
 }
 
 TEST_F(ValhallPacking, Fadd) {
-   CASE(bi_fadd_f32_to(b, bi_register(0), bi_register(1), bi_register(2), BI_ROUND_NONE),
+   CASE(bi_fadd_f32_to(b, bi_register(0), bi_register(1), bi_register(2)),
          0x00a4c00000000201ULL);
-   CASE(bi_fadd_f32_to(b, bi_register(0), bi_register(1), bi_abs(bi_register(2)), BI_ROUND_NONE),
+   CASE(bi_fadd_f32_to(b, bi_register(0), bi_register(1), bi_abs(bi_register(2))),
          0x00a4c02000000201ULL);
-   CASE(bi_fadd_f32_to(b, bi_register(0), bi_register(1), bi_neg(bi_register(2)), BI_ROUND_NONE),
+   CASE(bi_fadd_f32_to(b, bi_register(0), bi_register(1), bi_neg(bi_register(2))),
          0x00a4c01000000201ULL);
 
    CASE(bi_fadd_v2f16_to(b, bi_register(0), bi_swz_16(bi_register(1), false, false),
-                         bi_swz_16(bi_register(0), true, true), BI_ROUND_NONE),
+                         bi_swz_16(bi_register(0), true, true)),
          0x00a5c0000c000001ULL);
 
-   CASE(bi_fadd_v2f16_to(b, bi_register(0), bi_register(1), bi_register(0), BI_ROUND_NONE),
+   CASE(bi_fadd_v2f16_to(b, bi_register(0), bi_register(1), bi_register(0)),
          0x00a5c00028000001ULL);
 
    CASE(bi_fadd_v2f16_to(b, bi_register(0), bi_register(1),
-                         bi_swz_16(bi_register(0), true, false), BI_ROUND_NONE),
+                         bi_swz_16(bi_register(0), true, false)),
          0x00a5c00024000001ULL);
 
    CASE(bi_fadd_v2f16_to(b, bi_register(0), bi_discard(bi_abs(bi_register(0))),
-                         bi_neg(zero), BI_ROUND_NONE),
+                         bi_neg(zero)),
          0x00a5c0902800c040ULL);
 
    CASE(bi_fadd_f32_to(b, bi_register(0), bi_register(1),
-                       zero, BI_ROUND_NONE),
+                       zero),
          0x00a4c0000000c001ULL);
 
    CASE(bi_fadd_f32_to(b, bi_register(0), bi_register(1),
-                       bi_neg(zero), BI_ROUND_NONE),
+                       bi_neg(zero)),
          0x00a4c0100000c001ULL);
 
    CASE(bi_fadd_f32_to(b, bi_register(0), bi_register(1),
-                       bi_half(bi_register(0), true), BI_ROUND_NONE),
+                       bi_half(bi_register(0), true)),
          0x00a4c00008000001ULL);
 
    CASE(bi_fadd_f32_to(b, bi_register(0), bi_register(1),
-                       bi_half(bi_register(0), false), BI_ROUND_NONE),
+                       bi_half(bi_register(0), false)),
          0x00a4c00004000001ULL);
 }
 
@@ -112,8 +112,7 @@ TEST_F(ValhallPacking, Clper) {
 
 TEST_F(ValhallPacking, Clamps) {
    bi_instr *I = bi_fadd_f32_to(b, bi_register(0), bi_register(1),
-         bi_neg(bi_abs(bi_register(2))),
-         BI_ROUND_NONE);
+                                bi_neg(bi_abs(bi_register(2))));
    CASE(I, 0x00a4c03000000201ULL);
 
    I->clamp = BI_CLAMP_CLAMP_M1_1;
@@ -123,7 +122,7 @@ TEST_F(ValhallPacking, Clamps) {
 TEST_F(ValhallPacking, Misc) {
    CASE(bi_fma_f32_to(b, bi_register(1), bi_discard(bi_register(1)),
                          bi_fau((enum bir_fau) (BIR_FAU_UNIFORM | 4), false),
-                         bi_neg(zero), BI_ROUND_NONE),
+                         bi_neg(zero)),
          0x00b2c10400c08841ULL);
 
    CASE(bi_fround_f32_to(b, bi_register(2), bi_discard(bi_neg(bi_register(2))),
@@ -164,7 +163,7 @@ TEST_F(ValhallPacking, Comparions) {
 }
 
 TEST_F(ValhallPacking, Conversions) {
-   CASE(bi_v2s16_to_v2f16_to(b, bi_register(2), bi_discard(bi_register(2)), BI_ROUND_NONE),
+   CASE(bi_v2s16_to_v2f16_to(b, bi_register(2), bi_discard(bi_register(2))),
          0x0090c22000070042);
 }
 
@@ -219,7 +218,7 @@ TEST_F(ValhallPacking, Transcendentals) {
    CASE(bi_frsq_f32_to(b, bi_register(2), bi_register(1)),
         0x009cc20000020001);
 
-   CASE(bi_fma_rscale_f32_to(b, bi_register(0), bi_discard(bi_register(1)), bi_discard(bi_register(2)), bi_neg(zero), bi_discard(bi_register(0)), BI_ROUND_NONE, BI_SPECIAL_LEFT),
+   CASE(bi_fma_rscale_f32_to(b, bi_register(0), bi_discard(bi_register(1)), bi_discard(bi_register(2)), bi_neg(zero), bi_discard(bi_register(0)), BI_SPECIAL_LEFT),
         0x0162c00440c04241);
 }
 
diff --git a/src/panfrost/bifrost/valhall/test/test-validate-fau.cpp b/src/panfrost/bifrost/valhall/test/test-validate-fau.cpp
index 4275d0359fd..4aae6e027b3 100644
--- a/src/panfrost/bifrost/valhall/test/test-validate-fau.cpp
+++ b/src/panfrost/bifrost/valhall/test/test-validate-fau.cpp
@@ -66,53 +66,47 @@ protected:
 TEST_F(ValidateFau, One64BitUniformSlot)
 {
    VALID(bi_fma_f32_to(b, bi_register(1), bi_register(2), bi_register(3),
-            unif, BI_ROUND_NONE));
+            unif));
    VALID(bi_fma_f32_to(b, bi_register(1), bi_register(2), bi_word(unif, 1),
-            unif, BI_ROUND_NONE));
-   VALID(bi_fma_f32_to(b, bi_register(1), unif, unif, bi_word(unif, 1),
-            BI_ROUND_NONE));
-   INVALID(bi_fma_f32_to(b, bi_register(1), unif, unif2, bi_register(1),
-            BI_ROUND_NONE));
-   INVALID(bi_fma_f32_to(b, bi_register(1), unif, unif2, bi_word(unif, 1),
-            BI_ROUND_NONE));
+            unif));
+   VALID(bi_fma_f32_to(b, bi_register(1), unif, unif, bi_word(unif, 1)));
+   INVALID(bi_fma_f32_to(b, bi_register(1), unif, unif2, bi_register(1)));
+   INVALID(bi_fma_f32_to(b, bi_register(1), unif, unif2, bi_word(unif, 1)));
 
    /* Crafted case that appears correct at first glance and was erronously
     * marked as valid in early versions of the validator.
     */
    INVALID(bi_fma_f32_to(b, bi_register(1), bi_register(2),
                          bi_fau((enum bir_fau) (BIR_FAU_UNIFORM | 0), false),
-                         bi_fau((enum bir_fau) (BIR_FAU_UNIFORM | 1), true),
-                         BI_ROUND_NONE));
+                         bi_fau((enum bir_fau) (BIR_FAU_UNIFORM | 1), true)));
 }
 
 TEST_F(ValidateFau, Combined64BitUniformsConstants)
 {
    VALID(bi_fma_f32_to(b, bi_register(1), bi_register(2), bi_word(unif, 1),
-            unif, BI_ROUND_NONE));
-   VALID(bi_fma_f32_to(b, bi_register(1), bi_register(2), zero,
-            unif, BI_ROUND_NONE));
-   VALID(bi_fma_f32_to(b, bi_register(1), zero, imm1, imm1, BI_ROUND_NONE));
-   INVALID(bi_fma_f32_to(b, bi_register(1), zero, bi_word(unif, 1),
-            unif, BI_ROUND_NONE));
-   INVALID(bi_fma_f32_to(b, bi_register(1), zero, imm1, imm2, BI_ROUND_NONE));
+            unif));
+   VALID(bi_fma_f32_to(b, bi_register(1), bi_register(2), zero, unif));
+   VALID(bi_fma_f32_to(b, bi_register(1), zero, imm1, imm1));
+   INVALID(bi_fma_f32_to(b, bi_register(1), zero, bi_word(unif, 1), unif));
+   INVALID(bi_fma_f32_to(b, bi_register(1), zero, imm1, imm2));
 }
 
 TEST_F(ValidateFau, UniformsOnlyInDefaultMode)
 {
    INVALID(bi_fma_f32_to(b, bi_register(1), bi_register(2), bi_word(unif, 1),
-            lane_id, BI_ROUND_NONE));
+            lane_id));
    INVALID(bi_fma_f32_to(b, bi_register(1), bi_register(2), bi_word(unif, 1),
-            core_id, BI_ROUND_NONE));
+            core_id));
 }
 
 TEST_F(ValidateFau, SingleSpecialImmediate)
 {
    VALID(bi_fma_f32_to(b, bi_register(1), bi_register(2), bi_register(2),
-            lane_id, BI_ROUND_NONE));
+            lane_id));
    VALID(bi_fma_f32_to(b, bi_register(1), bi_register(2), bi_register(2),
-            core_id, BI_ROUND_NONE));
+            core_id));
    INVALID(bi_fma_f32_to(b, bi_register(1), bi_register(2), lane_id,
-            core_id, BI_ROUND_NONE));
+            core_id));
 }
 
 TEST_F(ValidateFau, SmokeTests)
@@ -120,5 +114,5 @@ TEST_F(ValidateFau, SmokeTests)
    VALID(bi_mov_i32_to(b, bi_register(1), bi_register(2)));
    VALID(bi_mov_i32_to(b, bi_register(1), unif));
    VALID(bi_fma_f32_to(b, bi_register(1), bi_discard(bi_register(1)),
-                        unif, bi_neg(zero), BI_ROUND_NONE));
+                        unif, bi_neg(zero)));
 }