pan/bi: Imply round mode most of the time

Much less noisy, and provides a path to further improvements. There is a slight
behaviour change: int-to-float conversions now use RTE instead of RTZ. For
32-bit opcodes, this affects conversions of integers with magnitude greater than
2^24 by at most 1 ulp. As this behaviour is unspecified in GLSL, this change is
believed to be acceptable.

Signed-off-by: Alyssa Rosenzweig <alyssa@collabora.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/15187>
This commit is contained in:
Alyssa Rosenzweig 2022-02-27 15:46:17 -05:00 committed by Marge Bot
parent a747708b9d
commit 1fb4427a7a
10 changed files with 209 additions and 209 deletions

View file

@ -19,7 +19,9 @@
# FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
# IN THE SOFTWARE.
SKIP = set(["lane", "lane_dest", "lanes", "lanes", "replicate", "swz", "widen", "swap", "neg", "abs", "not", "sign", "extend", "divzero", "clamp", "sem", "not_result", "skip"])
SKIP = set(["lane", "lane_dest", "lanes", "lanes", "replicate", "swz", "widen",
"swap", "neg", "abs", "not", "sign", "extend", "divzero", "clamp", "sem",
"not_result", "skip", "round"])
TEMPLATE = """
#ifndef _BI_BUILDER_H_
@ -99,10 +101,13 @@ bi_instr * bi_${opcode.replace('.', '_').lower()}${to_suffix(ops[opcode])}(${sig
I->src[${src}] = src${src};
% endfor
% for mod in ops[opcode]["modifiers"]:
% if not should_skip(mod):
% if not should_skip(mod, opcode):
I->${mod} = ${mod};
% endif
% endfor
% if ops[opcode]["rtz"]:
I->round = BI_ROUND_RTZ;
% endif
% for imm in ops[opcode]["immediates"]:
I->${imm} = ${imm};
% endfor
@ -170,11 +175,16 @@ modifier_lists = order_modifiers(ir_instructions)
# Generate type signature for a builder routine
def should_skip(mod):
def should_skip(mod, op):
# FROUND and HADD only make sense in context of a round mode, so override
# the usual skip
if mod == "round" and ("FROUND" in op or "HADD" in op):
return False
return mod in SKIP or mod[0:-1] in SKIP
def modifier_signature(op):
return sorted([m for m in op["modifiers"].keys() if not should_skip(m)])
return sorted([m for m in op["modifiers"].keys() if not should_skip(m, op["key"])])
def signature(op, modifiers, typeful = False, sized = False, no_dests = False):
return ", ".join(

View file

@ -147,7 +147,7 @@ bi_varying_src0_for_barycentric(bi_builder *b, nir_intrinsic_instr *intr)
if (sz == 16) {
f16 = bi_fma_v2f16(b, offset, bi_imm_f16(256.0),
bi_imm_f16(128.0), BI_ROUND_NONE);
bi_imm_f16(128.0));
} else {
assert(sz == 32);
bi_index f[2];
@ -155,13 +155,13 @@ bi_varying_src0_for_barycentric(bi_builder *b, nir_intrinsic_instr *intr)
f[i] = bi_fadd_rscale_f32(b,
bi_word(offset, i),
bi_imm_f32(0.5), bi_imm_u32(8),
BI_ROUND_NONE, BI_SPECIAL_NONE);
BI_SPECIAL_NONE);
}
f16 = bi_v2f32_to_v2f16(b, f[0], f[1], BI_ROUND_NONE);
f16 = bi_v2f32_to_v2f16(b, f[0], f[1]);
}
return bi_v2f16_to_v2s16(b, f16, BI_ROUND_RTZ);
return bi_v2f16_to_v2s16(b, f16);
}
case nir_intrinsic_load_barycentric_pixel:
@ -1244,7 +1244,7 @@ bi_emit_load_frag_coord(bi_builder *b, nir_intrinsic_instr *instr)
for (unsigned i = 0; i < 2; ++i) {
src[i] = bi_fadd_f32(b,
bi_u16_to_f32(b, bi_half(bi_register(59), i)),
bi_imm_f32(0.5f), BI_ROUND_NONE);
bi_imm_f32(0.5f));
}
for (unsigned i = 0; i < 2; ++i) {
@ -1691,7 +1691,7 @@ bi_nir_round(nir_op op)
static bi_index
bi_fmul_f32(bi_builder *b, bi_index s0, bi_index s1)
{
return bi_fma_f32(b, s0, s1, bi_imm_f32(-0.0f), BI_ROUND_NONE);
return bi_fma_f32(b, s0, s1, bi_imm_f32(-0.0f));
}
/* Approximate with FRCP_APPROX.f32 and apply a single iteration of
@ -1704,9 +1704,8 @@ bi_lower_frcp_32(bi_builder *b, bi_index dst, bi_index s0)
bi_index m = bi_frexpm_f32(b, s0, false, false);
bi_index e = bi_frexpe_f32(b, bi_neg(s0), false, false);
bi_index t1 = bi_fma_rscale_f32(b, m, bi_neg(x1), bi_imm_f32(1.0),
bi_zero(), BI_ROUND_NONE, BI_SPECIAL_N);
bi_fma_rscale_f32_to(b, dst, t1, x1, x1, e,
BI_ROUND_NONE, BI_SPECIAL_NONE);
bi_zero(), BI_SPECIAL_N);
bi_fma_rscale_f32_to(b, dst, t1, x1, x1, e, BI_SPECIAL_NONE);
}
static void
@ -1717,9 +1716,8 @@ bi_lower_frsq_32(bi_builder *b, bi_index dst, bi_index s0)
bi_index e = bi_frexpe_f32(b, bi_neg(s0), false, true);
bi_index t1 = bi_fmul_f32(b, x1, x1);
bi_index t2 = bi_fma_rscale_f32(b, m, bi_neg(t1), bi_imm_f32(1.0),
bi_imm_u32(-1), BI_ROUND_NONE, BI_SPECIAL_N);
bi_fma_rscale_f32_to(b, dst, t2, x1, x1, e,
BI_ROUND_NONE, BI_SPECIAL_N);
bi_imm_u32(-1), BI_SPECIAL_N);
bi_fma_rscale_f32_to(b, dst, t2, x1, x1, e, BI_SPECIAL_N);
}
/* More complex transcendentals, see
@ -1730,26 +1728,23 @@ static void
bi_lower_fexp2_32(bi_builder *b, bi_index dst, bi_index s0)
{
bi_index t1 = bi_temp(b->shader);
bi_instr *t1_instr = bi_fadd_f32_to(b, t1,
s0, bi_imm_u32(0x49400000), BI_ROUND_NONE);
bi_instr *t1_instr = bi_fadd_f32_to(b, t1, s0, bi_imm_u32(0x49400000));
t1_instr->clamp = BI_CLAMP_CLAMP_0_INF;
bi_index t2 = bi_fadd_f32(b, t1, bi_imm_u32(0xc9400000), BI_ROUND_NONE);
bi_index t2 = bi_fadd_f32(b, t1, bi_imm_u32(0xc9400000));
bi_instr *a2 = bi_fadd_f32_to(b, bi_temp(b->shader),
s0, bi_neg(t2), BI_ROUND_NONE);
bi_instr *a2 = bi_fadd_f32_to(b, bi_temp(b->shader), s0, bi_neg(t2));
a2->clamp = BI_CLAMP_CLAMP_M1_1;
bi_index a1t = bi_fexp_table_u4(b, t1, BI_ADJ_NONE);
bi_index t3 = bi_isub_u32(b, t1, bi_imm_u32(0x49400000), false);
bi_index a1i = bi_arshift_i32(b, t3, bi_null(), bi_imm_u8(4));
bi_index p1 = bi_fma_f32(b, a2->dest[0], bi_imm_u32(0x3d635635),
bi_imm_u32(0x3e75fffa), BI_ROUND_NONE);
bi_index p2 = bi_fma_f32(b, p1, a2->dest[0],
bi_imm_u32(0x3f317218), BI_ROUND_NONE);
bi_imm_u32(0x3e75fffa));
bi_index p2 = bi_fma_f32(b, p1, a2->dest[0], bi_imm_u32(0x3f317218));
bi_index p3 = bi_fmul_f32(b, a2->dest[0], p2);
bi_instr *x = bi_fma_rscale_f32_to(b, bi_temp(b->shader),
p3, a1t, a1t, a1i, BI_ROUND_NONE, BI_SPECIAL_NONE);
p3, a1t, a1t, a1i, BI_SPECIAL_NONE);
x->clamp = BI_CLAMP_CLAMP_0_INF;
bi_instr *max = bi_fmax_f32_to(b, dst, x->dest[0], s0);
@ -1762,12 +1757,13 @@ bi_fexp_32(bi_builder *b, bi_index dst, bi_index s0, bi_index log2_base)
/* Scale by base, Multiply by 2*24 and convert to integer to get a 8:24
* fixed-point input */
bi_index scale = bi_fma_rscale_f32(b, s0, log2_base, bi_negzero(),
bi_imm_u32(24), BI_ROUND_NONE, BI_SPECIAL_NONE);
bi_index fixed_pt = bi_f32_to_s32(b, scale, BI_ROUND_NONE);
bi_imm_u32(24), BI_SPECIAL_NONE);
bi_instr *fixed_pt = bi_f32_to_s32_to(b, bi_temp(b->shader), scale);
fixed_pt->round = BI_ROUND_NONE; // XXX
/* Compute the result for the fixed-point input, but pass along
* the floating-point scale for correct NaN propagation */
bi_fexp_f32_to(b, dst, fixed_pt, scale);
bi_fexp_f32_to(b, dst, fixed_pt->dest[0], scale);
}
static void
@ -1776,7 +1772,7 @@ bi_lower_flog2_32(bi_builder *b, bi_index dst, bi_index s0)
/* s0 = a1 * 2^e, with a1 in [0.75, 1.5) */
bi_index a1 = bi_frexpm_f32(b, s0, true, false);
bi_index ei = bi_frexpe_f32(b, s0, true, false);
bi_index ef = bi_s32_to_f32(b, ei, BI_ROUND_RTZ);
bi_index ef = bi_s32_to_f32(b, ei);
/* xt estimates -log(r1), a coarse approximation of log(a1) */
bi_index r1 = bi_flog_table_f32(b, s0, BI_MODE_RED, BI_PRECISION_NONE);
@ -1785,33 +1781,32 @@ bi_lower_flog2_32(bi_builder *b, bi_index dst, bi_index s0)
/* log(s0) = log(a1 * 2^e) = e + log(a1) = e + log(a1 * r1) -
* log(r1), so let x1 = e - log(r1) ~= e + xt and x2 = log(a1 * r1),
* and then log(s0) = x1 + x2 */
bi_index x1 = bi_fadd_f32(b, ef, xt, BI_ROUND_NONE);
bi_index x1 = bi_fadd_f32(b, ef, xt);
/* Since a1 * r1 is close to 1, x2 = log(a1 * r1) may be computed by
* polynomial approximation around 1. The series is expressed around
* 1, so set y = (a1 * r1) - 1.0 */
bi_index y = bi_fma_f32(b, a1, r1, bi_imm_f32(-1.0), BI_ROUND_NONE);
bi_index y = bi_fma_f32(b, a1, r1, bi_imm_f32(-1.0));
/* x2 = log_2(1 + y) = log_e(1 + y) * (1/log_e(2)), so approximate
* log_e(1 + y) by the Taylor series (lower precision than the blob):
* y - y^2/2 + O(y^3) = y(1 - y/2) + O(y^3) */
bi_index loge = bi_fmul_f32(b, y,
bi_fma_f32(b, y, bi_imm_f32(-0.5), bi_imm_f32(1.0), BI_ROUND_NONE));
bi_fma_f32(b, y, bi_imm_f32(-0.5), bi_imm_f32(1.0)));
bi_index x2 = bi_fmul_f32(b, loge, bi_imm_f32(1.0 / logf(2.0)));
/* log(s0) = x1 + x2 */
bi_fadd_f32_to(b, dst, x1, x2, BI_ROUND_NONE);
bi_fadd_f32_to(b, dst, x1, x2);
}
static void
bi_flog2_32(bi_builder *b, bi_index dst, bi_index s0)
{
bi_index frexp = bi_frexpe_f32(b, s0, true, false);
bi_index frexpi = bi_s32_to_f32(b, frexp, BI_ROUND_RTZ);
bi_index frexpi = bi_s32_to_f32(b, frexp);
bi_index add = bi_fadd_lscale_f32(b, bi_imm_f32(-1.0f), s0);
bi_fma_f32_to(b, dst, bi_flogd_f32(b, s0), add, frexpi,
BI_ROUND_NONE);
bi_fma_f32_to(b, dst, bi_flogd_f32(b, s0), add, frexpi);
}
static void
@ -1862,12 +1857,11 @@ static void
bi_lower_fsincos_32(bi_builder *b, bi_index dst, bi_index s0, bool cos)
{
/* bottom 6-bits of result times pi/32 approximately s0 mod 2pi */
bi_index x_u6 = bi_fma_f32(b, s0, TWO_OVER_PI, SINCOS_BIAS, BI_ROUND_NONE);
bi_index x_u6 = bi_fma_f32(b, s0, TWO_OVER_PI, SINCOS_BIAS);
/* Approximate domain error (small) */
bi_index e = bi_fma_f32(b, bi_fadd_f32(b, x_u6, bi_neg(SINCOS_BIAS),
BI_ROUND_NONE),
MPI_OVER_TWO, s0, BI_ROUND_NONE);
bi_index e = bi_fma_f32(b, bi_fadd_f32(b, x_u6, bi_neg(SINCOS_BIAS)),
MPI_OVER_TWO, s0);
/* Lookup sin(x), cos(x) */
bi_index sinx = bi_fsin_table_u6(b, x_u6, false);
@ -1875,21 +1869,21 @@ bi_lower_fsincos_32(bi_builder *b, bi_index dst, bi_index s0, bool cos)
/* e^2 / 2 */
bi_index e2_over_2 = bi_fma_rscale_f32(b, e, e, bi_negzero(),
bi_imm_u32(-1), BI_ROUND_NONE, BI_SPECIAL_NONE);
bi_imm_u32(-1), BI_SPECIAL_NONE);
/* (-e^2)/2 f''(x) */
bi_index quadratic = bi_fma_f32(b, bi_neg(e2_over_2),
cos ? cosx : sinx,
bi_negzero(), BI_ROUND_NONE);
bi_negzero());
/* e f'(x) - (e^2/2) f''(x) */
bi_instr *I = bi_fma_f32_to(b, bi_temp(b->shader), e,
cos ? bi_neg(sinx) : cosx,
quadratic, BI_ROUND_NONE);
quadratic);
I->clamp = BI_CLAMP_CLAMP_M1_1;
/* f(x) + e f'(x) - (e^2/2) f''(x) */
bi_fadd_f32_to(b, dst, I->dest[0], cos ? cosx : sinx, BI_ROUND_NONE);
bi_fadd_f32_to(b, dst, I->dest[0], cos ? cosx : sinx);
}
/* The XOR lane op is useful for derivative calculation, but was added in v7.
@ -2056,7 +2050,7 @@ bi_emit_alu(bi_builder *b, nir_alu_instr *instr)
bi_index s1 = comps > 1 ?
bi_word(idx, instr->src[0].swizzle[1]) : s0;
bi_v2f32_to_v2f16_to(b, dst, s0, s1, BI_ROUND_NONE);
bi_v2f32_to_v2f16_to(b, dst, s0, s1);
return;
/* Vectorized downcasts */
@ -2095,9 +2089,9 @@ bi_emit_alu(bi_builder *b, nir_alu_instr *instr)
bi_half(s1, false));
if (instr->op == nir_op_u2f16)
bi_v2u16_to_v2f16_to(b, dst, t, BI_ROUND_NONE);
bi_v2u16_to_v2f16_to(b, dst, t);
else
bi_v2s16_to_v2f16_to(b, dst, t, BI_ROUND_NONE);
bi_v2s16_to_v2f16_to(b, dst, t);
return;
}
@ -2158,18 +2152,18 @@ bi_emit_alu(bi_builder *b, nir_alu_instr *instr)
switch (instr->op) {
case nir_op_ffma:
bi_fma_to(b, sz, dst, s0, s1, s2, BI_ROUND_NONE);
bi_fma_to(b, sz, dst, s0, s1, s2);
break;
case nir_op_fmul:
bi_fma_to(b, sz, dst, s0, s1, bi_negzero(), BI_ROUND_NONE);
bi_fma_to(b, sz, dst, s0, s1, bi_negzero());
break;
case nir_op_fsub:
s1 = bi_neg(s1);
FALLTHROUGH;
case nir_op_fadd:
bi_fadd_to(b, sz, dst, s0, s1, BI_ROUND_NONE);
bi_fadd_to(b, sz, dst, s0, s1);
break;
case nir_op_fsat: {
@ -2245,7 +2239,7 @@ bi_emit_alu(bi_builder *b, nir_alu_instr *instr)
break;
case nir_op_ldexp:
bi_ldexp_to(b, sz, dst, s0, s1, BI_ROUND_NONE);
bi_ldexp_to(b, sz, dst, s0, s1);
break;
case nir_op_b8csel:
@ -2290,7 +2284,7 @@ bi_emit_alu(bi_builder *b, nir_alu_instr *instr)
case nir_op_fddy_must_abs_mali: {
bi_index bit = bi_imm_u32(instr->op == nir_op_fddx_must_abs_mali ? 1 : 2);
bi_index adjacent = bi_clper_xor(b, s0, bit);
bi_fadd_to(b, sz, dst, adjacent, bi_neg(s0), BI_ROUND_NONE);
bi_fadd_to(b, sz, dst, adjacent, bi_neg(s0));
break;
}
@ -2355,7 +2349,7 @@ bi_emit_alu(bi_builder *b, nir_alu_instr *instr)
BI_SUBGROUP_SUBGROUP4);
}
bi_fadd_to(b, sz, dst, right, bi_neg(left), BI_ROUND_NONE);
bi_fadd_to(b, sz, dst, right, bi_neg(left));
break;
}
@ -2365,45 +2359,45 @@ bi_emit_alu(bi_builder *b, nir_alu_instr *instr)
case nir_op_f2i32:
if (src_sz == 32)
bi_f32_to_s32_to(b, dst, s0, BI_ROUND_RTZ);
bi_f32_to_s32_to(b, dst, s0);
else
bi_f16_to_s32_to(b, dst, s0, BI_ROUND_RTZ);
bi_f16_to_s32_to(b, dst, s0);
break;
/* Note 32-bit sources => no vectorization, so 32-bit works */
case nir_op_f2u16:
if (src_sz == 32)
bi_f32_to_u32_to(b, dst, s0, BI_ROUND_RTZ);
bi_f32_to_u32_to(b, dst, s0);
else
bi_v2f16_to_v2u16_to(b, dst, s0, BI_ROUND_RTZ);
bi_v2f16_to_v2u16_to(b, dst, s0);
break;
case nir_op_f2i16:
if (src_sz == 32)
bi_f32_to_s32_to(b, dst, s0, BI_ROUND_RTZ);
bi_f32_to_s32_to(b, dst, s0);
else
bi_v2f16_to_v2s16_to(b, dst, s0, BI_ROUND_RTZ);
bi_v2f16_to_v2s16_to(b, dst, s0);
break;
case nir_op_f2u32:
if (src_sz == 32)
bi_f32_to_u32_to(b, dst, s0, BI_ROUND_RTZ);
bi_f32_to_u32_to(b, dst, s0);
else
bi_f16_to_u32_to(b, dst, s0, BI_ROUND_RTZ);
bi_f16_to_u32_to(b, dst, s0);
break;
case nir_op_u2f16:
if (src_sz == 32)
bi_v2u16_to_v2f16_to(b, dst, bi_half(s0, false), BI_ROUND_RTZ);
bi_v2u16_to_v2f16_to(b, dst, bi_half(s0, false));
else if (src_sz == 16)
bi_v2u16_to_v2f16_to(b, dst, s0, BI_ROUND_RTZ);
bi_v2u16_to_v2f16_to(b, dst, s0);
else if (src_sz == 8)
bi_v2u8_to_v2f16_to(b, dst, s0);
break;
case nir_op_u2f32:
if (src_sz == 32)
bi_u32_to_f32_to(b, dst, s0, BI_ROUND_RTZ);
bi_u32_to_f32_to(b, dst, s0);
else if (src_sz == 16)
bi_u16_to_f32_to(b, dst, s0);
else
@ -2412,9 +2406,9 @@ bi_emit_alu(bi_builder *b, nir_alu_instr *instr)
case nir_op_i2f16:
if (src_sz == 32)
bi_v2s16_to_v2f16_to(b, dst, bi_half(s0, false), BI_ROUND_RTZ);
bi_v2s16_to_v2f16_to(b, dst, bi_half(s0, false));
else if (src_sz == 16)
bi_v2s16_to_v2f16_to(b, dst, s0, BI_ROUND_RTZ);
bi_v2s16_to_v2f16_to(b, dst, s0);
else if (src_sz == 8)
bi_v2s8_to_v2f16_to(b, dst, s0);
break;
@ -2423,7 +2417,7 @@ bi_emit_alu(bi_builder *b, nir_alu_instr *instr)
assert(src_sz == 32 || src_sz == 16 || src_sz == 8);
if (src_sz == 32)
bi_s32_to_f32_to(b, dst, s0, BI_ROUND_RTZ);
bi_s32_to_f32_to(b, dst, s0);
else if (src_sz == 16)
bi_s16_to_f32_to(b, dst, s0);
else if (src_sz == 8)
@ -2732,7 +2726,9 @@ bi_emit_texc_array_index(bi_builder *b, bi_index idx, nir_alu_type T)
* 0, dt - 1). So we use round RTE, clamping is handled at the data
* structure level */
return bi_f32_to_u32(b, idx, BI_ROUND_NONE);
bi_instr *I = bi_f32_to_u32_to(b, bi_temp(b->shader), idx);
I->round = BI_ROUND_NONE;
return I->dest[0];
}
/* TEXC's explicit and bias LOD modes requires the LOD to be transformed to a
@ -2760,16 +2756,15 @@ bi_emit_texc_lod_88(bi_builder *b, bi_index lod, bool fp16)
bi_instr *fsat = bi_fma_f32_to(b, bi_temp(b->shader),
fp16 ? bi_half(lod, false) : lod,
bi_imm_f32(1.0f / max_lod), bi_negzero(), BI_ROUND_NONE);
bi_imm_f32(1.0f / max_lod), bi_negzero());
fsat->clamp = BI_CLAMP_CLAMP_M1_1;
bi_index fmul = bi_fma_f32(b, fsat->dest[0], bi_imm_f32(max_lod * 256.0f),
bi_negzero(), BI_ROUND_NONE);
bi_negzero());
return bi_mkvec_v2i16(b,
bi_half(bi_f32_to_s32(b, fmul, BI_ROUND_RTZ), false),
bi_imm_u16(0));
bi_half(bi_f32_to_s32(b, fmul), false), bi_imm_u16(0));
}
/* FETCH takes a 32-bit staging register containing the LOD as an integer in
@ -2911,17 +2906,14 @@ bi_emit_cube_coord(bi_builder *b, bi_index coord,
bi_index rcp = bi_frcp_f32(b, maxxyz);
/* Calculate 0.5 * (1.0 / max{x, y, z}) */
bi_index fma1 = bi_fma_f32(b, rcp, bi_imm_f32(0.5f), bi_negzero(),
BI_ROUND_NONE);
bi_index fma1 = bi_fma_f32(b, rcp, bi_imm_f32(0.5f), bi_negzero());
/* Transform the coordinates */
*s = bi_temp(b->shader);
*t = bi_temp(b->shader);
bi_instr *S = bi_fma_f32_to(b, *s, fma1, ssel, bi_imm_f32(0.5f),
BI_ROUND_NONE);
bi_instr *T = bi_fma_f32_to(b, *t, fma1, tsel, bi_imm_f32(0.5f),
BI_ROUND_NONE);
bi_instr *S = bi_fma_f32_to(b, *s, fma1, ssel, bi_imm_f32(0.5f));
bi_instr *T = bi_fma_f32_to(b, *t, fma1, tsel, bi_imm_f32(0.5f));
S->clamp = BI_CLAMP_CLAMP_0_1;
T->clamp = BI_CLAMP_CLAMP_0_1;

View file

@ -278,6 +278,7 @@ def combine_ir_variants(instructions, key):
# Great, we've checked srcs/immediates are consistent and we've summed over
# modifiers
return {
'key': key,
'srcs': variants[0]['srcs'],
'dests': variants[0]['dests'],
'staging': variants[0]['staging'],

View file

@ -186,6 +186,6 @@ TEST_F(ConstantFold, OtherOperationsShouldNotFold)
bi_index zero = bi_fau(bir_fau(BIR_FAU_IMMEDIATE | 0), false);
bi_index reg = bi_register(0);
EXPECT_NOT_FOLD(bi_fma_f32_to(b, reg, zero, zero, zero, BI_ROUND_NONE));
EXPECT_NOT_FOLD(bi_fadd_f32_to(b, reg, zero, zero, BI_ROUND_NONE));
EXPECT_NOT_FOLD(bi_fma_f32_to(b, reg, zero, zero, zero));
EXPECT_NOT_FOLD(bi_fadd_f32_to(b, reg, zero, zero));
}

View file

@ -63,17 +63,17 @@ protected:
TEST_F(Optimizer, FusedFABSNEG)
{
CASE(bi_fadd_f32_to(b, reg, bi_fabsneg_f32(b, bi_abs(x)), y, BI_ROUND_NONE),
bi_fadd_f32_to(b, reg, bi_abs(x), y, BI_ROUND_NONE));
CASE(bi_fadd_f32_to(b, reg, bi_fabsneg_f32(b, bi_abs(x)), y),
bi_fadd_f32_to(b, reg, bi_abs(x), y));
CASE(bi_fadd_f32_to(b, reg, bi_fabsneg_f32(b, bi_neg(x)), y, BI_ROUND_NONE),
bi_fadd_f32_to(b, reg, bi_neg(x), y, BI_ROUND_NONE));
CASE(bi_fadd_f32_to(b, reg, bi_fabsneg_f32(b, bi_neg(x)), y),
bi_fadd_f32_to(b, reg, bi_neg(x), y));
CASE(bi_fadd_f32_to(b, reg, bi_fabsneg_f32(b, negabsx), y, BI_ROUND_NONE),
bi_fadd_f32_to(b, reg, negabsx, y, BI_ROUND_NONE));
CASE(bi_fadd_f32_to(b, reg, bi_fabsneg_f32(b, negabsx), y),
bi_fadd_f32_to(b, reg, negabsx, y));
CASE(bi_fadd_f32_to(b, reg, bi_fabsneg_f32(b, x), y, BI_ROUND_NONE),
bi_fadd_f32_to(b, reg, x, y, BI_ROUND_NONE));
CASE(bi_fadd_f32_to(b, reg, bi_fabsneg_f32(b, x), y),
bi_fadd_f32_to(b, reg, x, y));
CASE(bi_fmin_f32_to(b, reg, bi_fabsneg_f32(b, negabsx), bi_neg(y)),
bi_fmin_f32_to(b, reg, negabsx, bi_neg(y)));
@ -81,8 +81,8 @@ TEST_F(Optimizer, FusedFABSNEG)
TEST_F(Optimizer, FusedFABSNEGForFP16)
{
CASE(bi_fadd_v2f16_to(b, reg, bi_fabsneg_v2f16(b, negabsx), y, BI_ROUND_NONE),
bi_fadd_v2f16_to(b, reg, negabsx, y, BI_ROUND_NONE));
CASE(bi_fadd_v2f16_to(b, reg, bi_fabsneg_v2f16(b, negabsx), y),
bi_fadd_v2f16_to(b, reg, negabsx, y));
CASE(bi_fmin_v2f16_to(b, reg, bi_fabsneg_v2f16(b, negabsx), bi_neg(y)),
bi_fmin_v2f16_to(b, reg, negabsx, bi_neg(y)));
@ -91,26 +91,26 @@ TEST_F(Optimizer, FusedFABSNEGForFP16)
TEST_F(Optimizer, FuseFADD_F32WithEqualSourcesAbsAbsAndClamp)
{
CASE({
bi_instr *I = bi_fadd_f32_to(b, reg, bi_fabsneg_f32(b, bi_abs(x)), bi_abs(x), BI_ROUND_NONE);
bi_instr *I = bi_fadd_f32_to(b, reg, bi_fabsneg_f32(b, bi_abs(x)), bi_abs(x));
I->clamp = BI_CLAMP_CLAMP_0_1;
}, {
bi_instr *I = bi_fadd_f32_to(b, reg, bi_abs(x), bi_abs(x), BI_ROUND_NONE);
bi_instr *I = bi_fadd_f32_to(b, reg, bi_abs(x), bi_abs(x));
I->clamp = BI_CLAMP_CLAMP_0_1;
});
CASE({
bi_instr *I = bi_fadd_f32_to(b, reg, bi_abs(x), bi_fabsneg_f32(b, bi_abs(x)), BI_ROUND_NONE);
bi_instr *I = bi_fadd_f32_to(b, reg, bi_abs(x), bi_fabsneg_f32(b, bi_abs(x)));
I->clamp = BI_CLAMP_CLAMP_0_1;
}, {
bi_instr *I = bi_fadd_f32_to(b, reg, bi_abs(x), bi_abs(x), BI_ROUND_NONE);
bi_instr *I = bi_fadd_f32_to(b, reg, bi_abs(x), bi_abs(x));
I->clamp = BI_CLAMP_CLAMP_0_1;
});
CASE({
bi_instr *I = bi_fclamp_f32_to(b, reg, bi_fadd_f32(b, bi_abs(x), bi_abs(x), BI_ROUND_NONE));
bi_instr *I = bi_fclamp_f32_to(b, reg, bi_fadd_f32(b, bi_abs(x), bi_abs(x)));
I->clamp = BI_CLAMP_CLAMP_0_INF;
}, {
bi_instr *I = bi_fadd_f32_to(b, reg, bi_abs(x), bi_abs(x), BI_ROUND_NONE);
bi_instr *I = bi_fadd_f32_to(b, reg, bi_abs(x), bi_abs(x));
I->clamp = BI_CLAMP_CLAMP_0_INF;
});
}
@ -118,26 +118,26 @@ TEST_F(Optimizer, FuseFADD_F32WithEqualSourcesAbsAbsAndClamp)
TEST_F(Optimizer, FuseFADD_V2F16WithDifferentSourcesAbsAbsAndClamp)
{
CASE({
bi_instr *I = bi_fadd_v2f16_to(b, reg, bi_fabsneg_v2f16(b, bi_abs(x)), bi_abs(y), BI_ROUND_NONE);
bi_instr *I = bi_fadd_v2f16_to(b, reg, bi_fabsneg_v2f16(b, bi_abs(x)), bi_abs(y));
I->clamp = BI_CLAMP_CLAMP_0_1;
}, {
bi_instr *I = bi_fadd_v2f16_to(b, reg, bi_abs(x), bi_abs(y), BI_ROUND_NONE);
bi_instr *I = bi_fadd_v2f16_to(b, reg, bi_abs(x), bi_abs(y));
I->clamp = BI_CLAMP_CLAMP_0_1;
});
CASE({
bi_instr *I = bi_fadd_v2f16_to(b, reg, bi_abs(x), bi_fabsneg_v2f16(b, bi_abs(y)), BI_ROUND_NONE);
bi_instr *I = bi_fadd_v2f16_to(b, reg, bi_abs(x), bi_fabsneg_v2f16(b, bi_abs(y)));
I->clamp = BI_CLAMP_CLAMP_0_1;
}, {
bi_instr *I = bi_fadd_v2f16_to(b, reg, bi_abs(x), bi_abs(y), BI_ROUND_NONE);
bi_instr *I = bi_fadd_v2f16_to(b, reg, bi_abs(x), bi_abs(y));
I->clamp = BI_CLAMP_CLAMP_0_1;
});
CASE({
bi_instr *I = bi_fclamp_v2f16_to(b, reg, bi_fadd_v2f16(b, bi_abs(x), bi_abs(y), BI_ROUND_NONE));
bi_instr *I = bi_fclamp_v2f16_to(b, reg, bi_fadd_v2f16(b, bi_abs(x), bi_abs(y)));
I->clamp = BI_CLAMP_CLAMP_0_INF;
}, {
bi_instr *I = bi_fadd_v2f16_to(b, reg, bi_abs(x), bi_abs(y), BI_ROUND_NONE);
bi_instr *I = bi_fadd_v2f16_to(b, reg, bi_abs(x), bi_abs(y));
I->clamp = BI_CLAMP_CLAMP_0_INF;
});
}
@ -145,57 +145,57 @@ TEST_F(Optimizer, FuseFADD_V2F16WithDifferentSourcesAbsAbsAndClamp)
TEST_F(Optimizer, AvoidFADD_V2F16WithEqualSourcesAbsAbsAndClamp)
{
NEGCASE({
bi_instr *I = bi_fadd_v2f16_to(b, reg, bi_fabsneg_v2f16(b, bi_abs(x)), bi_abs(x), BI_ROUND_NONE);
bi_instr *I = bi_fadd_v2f16_to(b, reg, bi_fabsneg_v2f16(b, bi_abs(x)), bi_abs(x));
I->clamp = BI_CLAMP_CLAMP_0_1;
});
NEGCASE({
bi_instr *I = bi_fadd_v2f16_to(b, reg, bi_abs(x), bi_fabsneg_v2f16(b, bi_abs(x)), BI_ROUND_NONE);
bi_instr *I = bi_fadd_v2f16_to(b, reg, bi_abs(x), bi_fabsneg_v2f16(b, bi_abs(x)));
I->clamp = BI_CLAMP_CLAMP_0_1;
});
NEGCASE({
bi_instr *I = bi_fclamp_v2f16_to(b, reg, bi_fadd_v2f16(b, bi_abs(x), bi_abs(x), BI_ROUND_NONE));
bi_instr *I = bi_fclamp_v2f16_to(b, reg, bi_fadd_v2f16(b, bi_abs(x), bi_abs(x)));
I->clamp = BI_CLAMP_CLAMP_0_INF;
});
}
TEST_F(Optimizer, SwizzlesComposedForFP16)
{
CASE(bi_fadd_v2f16_to(b, reg, bi_fabsneg_v2f16(b, bi_swz_16(negabsx, true, false)), y, BI_ROUND_NONE),
bi_fadd_v2f16_to(b, reg, bi_swz_16(negabsx, true, false), y, BI_ROUND_NONE));
CASE(bi_fadd_v2f16_to(b, reg, bi_fabsneg_v2f16(b, bi_swz_16(negabsx, true, false)), y),
bi_fadd_v2f16_to(b, reg, bi_swz_16(negabsx, true, false), y));
CASE(bi_fadd_v2f16_to(b, reg, bi_swz_16(bi_fabsneg_v2f16(b, negabsx), true, false), y, BI_ROUND_NONE),
bi_fadd_v2f16_to(b, reg, bi_swz_16(negabsx, true, false), y, BI_ROUND_NONE));
CASE(bi_fadd_v2f16_to(b, reg, bi_swz_16(bi_fabsneg_v2f16(b, negabsx), true, false), y),
bi_fadd_v2f16_to(b, reg, bi_swz_16(negabsx, true, false), y));
CASE(bi_fadd_v2f16_to(b, reg, bi_swz_16(bi_fabsneg_v2f16(b, bi_swz_16(negabsx, true, false)), true, false), y, BI_ROUND_NONE),
bi_fadd_v2f16_to(b, reg, negabsx, y, BI_ROUND_NONE));
CASE(bi_fadd_v2f16_to(b, reg, bi_swz_16(bi_fabsneg_v2f16(b, bi_swz_16(negabsx, true, false)), true, false), y),
bi_fadd_v2f16_to(b, reg, negabsx, y));
CASE(bi_fadd_v2f16_to(b, reg, bi_swz_16(bi_fabsneg_v2f16(b, bi_half(negabsx, false)), true, false), y, BI_ROUND_NONE),
bi_fadd_v2f16_to(b, reg, bi_half(negabsx, false), y, BI_ROUND_NONE));
CASE(bi_fadd_v2f16_to(b, reg, bi_swz_16(bi_fabsneg_v2f16(b, bi_half(negabsx, false)), true, false), y),
bi_fadd_v2f16_to(b, reg, bi_half(negabsx, false), y));
CASE(bi_fadd_v2f16_to(b, reg, bi_swz_16(bi_fabsneg_v2f16(b, bi_half(negabsx, true)), true, false), y, BI_ROUND_NONE),
bi_fadd_v2f16_to(b, reg, bi_half(negabsx, true), y, BI_ROUND_NONE));
CASE(bi_fadd_v2f16_to(b, reg, bi_swz_16(bi_fabsneg_v2f16(b, bi_half(negabsx, true)), true, false), y),
bi_fadd_v2f16_to(b, reg, bi_half(negabsx, true), y));
}
TEST_F(Optimizer, PreserveWidens)
{
/* Check that widens are passed through */
CASE(bi_fadd_f32_to(b, reg, bi_fabsneg_f32(b, bi_half(negabsx, false)), y, BI_ROUND_NONE),
bi_fadd_f32_to(b, reg, bi_half(negabsx, false), y, BI_ROUND_NONE));
CASE(bi_fadd_f32_to(b, reg, bi_fabsneg_f32(b, bi_half(negabsx, false)), y),
bi_fadd_f32_to(b, reg, bi_half(negabsx, false), y));
CASE(bi_fadd_f32_to(b, reg, bi_fabsneg_f32(b, bi_half(negabsx, true)), y, BI_ROUND_NONE),
bi_fadd_f32_to(b, reg, bi_half(negabsx, true), y, BI_ROUND_NONE));
CASE(bi_fadd_f32_to(b, reg, bi_fabsneg_f32(b, bi_half(negabsx, true)), y),
bi_fadd_f32_to(b, reg, bi_half(negabsx, true), y));
CASE(bi_fadd_f32_to(b, reg, bi_fabsneg_f32(b, bi_half(x, true)), bi_fabsneg_f32(b, bi_half(x, false)), BI_ROUND_NONE),
bi_fadd_f32_to(b, reg, bi_half(x, true), bi_half(x, false), BI_ROUND_NONE));
CASE(bi_fadd_f32_to(b, reg, bi_fabsneg_f32(b, bi_half(x, true)), bi_fabsneg_f32(b, bi_half(x, false))),
bi_fadd_f32_to(b, reg, bi_half(x, true), bi_half(x, false)));
}
TEST_F(Optimizer, DoNotMixSizesForFABSNEG)
{
/* Refuse to mix sizes for fabsneg, that's wrong */
NEGCASE(bi_fadd_f32_to(b, reg, bi_fabsneg_v2f16(b, negabsx), y, BI_ROUND_NONE));
NEGCASE(bi_fadd_v2f16_to(b, reg, bi_fabsneg_f32(b, negabsx), y, BI_ROUND_NONE));
NEGCASE(bi_fadd_f32_to(b, reg, bi_fabsneg_v2f16(b, negabsx), y));
NEGCASE(bi_fadd_v2f16_to(b, reg, bi_fabsneg_f32(b, negabsx), y));
}
TEST_F(Optimizer, AvoidZeroAndFABSNEGFootguns)
@ -206,27 +206,27 @@ TEST_F(Optimizer, AvoidZeroAndFABSNEGFootguns)
bi_index zero = bi_zero();
NEGCASE(bi_fadd_f32_to(b, reg, bi_fadd_f32(b, bi_abs(x), zero, BI_ROUND_NONE), y, BI_ROUND_NONE));
NEGCASE(bi_fadd_f32_to(b, reg, bi_fadd_f32(b, bi_neg(x), zero, BI_ROUND_NONE), y, BI_ROUND_NONE));
NEGCASE(bi_fadd_f32_to(b, reg, bi_fadd_f32(b, bi_neg(bi_abs(x)), zero, BI_ROUND_NONE), y, BI_ROUND_NONE));
NEGCASE(bi_fadd_f32_to(b, reg, bi_fadd_f32(b, x, zero, BI_ROUND_NONE), y, BI_ROUND_NONE));
NEGCASE(bi_fadd_f32_to(b, reg, bi_fadd_f32(b, bi_abs(x), zero), y));
NEGCASE(bi_fadd_f32_to(b, reg, bi_fadd_f32(b, bi_neg(x), zero), y));
NEGCASE(bi_fadd_f32_to(b, reg, bi_fadd_f32(b, bi_neg(bi_abs(x)), zero), y));
NEGCASE(bi_fadd_f32_to(b, reg, bi_fadd_f32(b, x, zero), y));
}
TEST_F(Optimizer, ClampsPropagated)
{
CASE({
bi_instr *I = bi_fclamp_f32_to(b, reg, bi_fadd_f32(b, x, y, BI_ROUND_NONE));
bi_instr *I = bi_fclamp_f32_to(b, reg, bi_fadd_f32(b, x, y));
I->clamp = BI_CLAMP_CLAMP_0_INF;
}, {
bi_instr *I = bi_fadd_f32_to(b, reg, x, y, BI_ROUND_NONE);
bi_instr *I = bi_fadd_f32_to(b, reg, x, y);
I->clamp = BI_CLAMP_CLAMP_0_INF;
});
CASE({
bi_instr *I = bi_fclamp_v2f16_to(b, reg, bi_fadd_v2f16(b, x, y, BI_ROUND_NONE));
bi_instr *I = bi_fclamp_v2f16_to(b, reg, bi_fadd_v2f16(b, x, y));
I->clamp = BI_CLAMP_CLAMP_0_1;
}, {
bi_instr *I = bi_fadd_v2f16_to(b, reg, x, y, BI_ROUND_NONE);
bi_instr *I = bi_fadd_v2f16_to(b, reg, x, y);
I->clamp = BI_CLAMP_CLAMP_0_1;
});
}
@ -235,62 +235,62 @@ TEST_F(Optimizer, ClampsPropagated)
TEST_F(Optimizer, ClampsComposed)
{
CASE({
bi_instr *I = bi_fadd_f32_to(b, bi_temp(b->shader), x, y, BI_ROUND_NONE);
bi_instr *I = bi_fadd_f32_to(b, bi_temp(b->shader), x, y);
bi_instr *J = bi_fclamp_f32_to(b, reg, I->dest[0]);
I->clamp = BI_CLAMP_CLAMP_M1_1;
J->clamp = BI_CLAMP_CLAMP_0_INF;
}, {
bi_instr *I = bi_fadd_f32_to(b, reg, x, y, BI_ROUND_NONE);
bi_instr *I = bi_fadd_f32_to(b, reg, x, y);
I->clamp = BI_CLAMP_CLAMP_0_1;
});
CASE({
bi_instr *I = bi_fadd_f32_to(b, bi_temp(b->shader), x, y, BI_ROUND_NONE);
bi_instr *I = bi_fadd_f32_to(b, bi_temp(b->shader), x, y);
bi_instr *J = bi_fclamp_f32_to(b, reg, I->dest[0]);
I->clamp = BI_CLAMP_CLAMP_0_1;
J->clamp = BI_CLAMP_CLAMP_0_INF;
}, {
bi_instr *I = bi_fadd_f32_to(b, reg, x, y, BI_ROUND_NONE);
bi_instr *I = bi_fadd_f32_to(b, reg, x, y);
I->clamp = BI_CLAMP_CLAMP_0_1;
});
CASE({
bi_instr *I = bi_fadd_f32_to(b, bi_temp(b->shader), x, y, BI_ROUND_NONE);
bi_instr *I = bi_fadd_f32_to(b, bi_temp(b->shader), x, y);
bi_instr *J = bi_fclamp_f32_to(b, reg, I->dest[0]);
I->clamp = BI_CLAMP_CLAMP_0_INF;
J->clamp = BI_CLAMP_CLAMP_0_INF;
}, {
bi_instr *I = bi_fadd_f32_to(b, reg, x, y, BI_ROUND_NONE);
bi_instr *I = bi_fadd_f32_to(b, reg, x, y);
I->clamp = BI_CLAMP_CLAMP_0_INF;
});
CASE({
bi_instr *I = bi_fadd_v2f16_to(b, bi_temp(b->shader), x, y, BI_ROUND_NONE);
bi_instr *I = bi_fadd_v2f16_to(b, bi_temp(b->shader), x, y);
bi_instr *J = bi_fclamp_v2f16_to(b, reg, I->dest[0]);
I->clamp = BI_CLAMP_CLAMP_M1_1;
J->clamp = BI_CLAMP_CLAMP_0_INF;
}, {
bi_instr *I = bi_fadd_v2f16_to(b, reg, x, y, BI_ROUND_NONE);
bi_instr *I = bi_fadd_v2f16_to(b, reg, x, y);
I->clamp = BI_CLAMP_CLAMP_0_1;
});
CASE({
bi_instr *I = bi_fadd_v2f16_to(b, bi_temp(b->shader), x, y, BI_ROUND_NONE);
bi_instr *I = bi_fadd_v2f16_to(b, bi_temp(b->shader), x, y);
bi_instr *J = bi_fclamp_v2f16_to(b, reg, I->dest[0]);
I->clamp = BI_CLAMP_CLAMP_0_1;
J->clamp = BI_CLAMP_CLAMP_0_INF;
}, {
bi_instr *I = bi_fadd_v2f16_to(b, reg, x, y, BI_ROUND_NONE);
bi_instr *I = bi_fadd_v2f16_to(b, reg, x, y);
I->clamp = BI_CLAMP_CLAMP_0_1;
});
CASE({
bi_instr *I = bi_fadd_v2f16_to(b, bi_temp(b->shader), x, y, BI_ROUND_NONE);
bi_instr *I = bi_fadd_v2f16_to(b, bi_temp(b->shader), x, y);
bi_instr *J = bi_fclamp_v2f16_to(b, reg, I->dest[0]);
I->clamp = BI_CLAMP_CLAMP_0_INF;
J->clamp = BI_CLAMP_CLAMP_0_INF;
}, {
bi_instr *I = bi_fadd_v2f16_to(b, reg, x, y, BI_ROUND_NONE);
bi_instr *I = bi_fadd_v2f16_to(b, reg, x, y);
I->clamp = BI_CLAMP_CLAMP_0_INF;
});
}
@ -298,12 +298,12 @@ TEST_F(Optimizer, ClampsComposed)
TEST_F(Optimizer, DoNotMixSizesWhenClamping)
{
NEGCASE({
bi_instr *I = bi_fclamp_f32_to(b, reg, bi_fadd_v2f16(b, x, y, BI_ROUND_NONE));
bi_instr *I = bi_fclamp_f32_to(b, reg, bi_fadd_v2f16(b, x, y));
I->clamp = BI_CLAMP_CLAMP_0_1;
});
NEGCASE({
bi_instr *I = bi_fclamp_v2f16_to(b, reg, bi_fadd_f32(b, x, y, BI_ROUND_NONE));
bi_instr *I = bi_fclamp_v2f16_to(b, reg, bi_fadd_f32(b, x, y));
I->clamp = BI_CLAMP_CLAMP_0_1;
});
}
@ -314,12 +314,12 @@ TEST_F(Optimizer, DoNotUseAdditionByZeroForClamps)
/* We can't use addition by 0.0 for clamps due to signed zeros. */
NEGCASE({
bi_instr *I = bi_fadd_f32_to(b, reg, bi_fadd_f32(b, x, y, BI_ROUND_NONE), zero, BI_ROUND_NONE);
bi_instr *I = bi_fadd_f32_to(b, reg, bi_fadd_f32(b, x, y), zero);
I->clamp = BI_CLAMP_CLAMP_M1_1;
});
NEGCASE({
bi_instr *I = bi_fadd_v2f16_to(b, reg, bi_fadd_v2f16(b, x, y, BI_ROUND_NONE), zero, BI_ROUND_NONE);
bi_instr *I = bi_fadd_v2f16_to(b, reg, bi_fadd_v2f16(b, x, y), zero);
I->clamp = BI_CLAMP_CLAMP_0_1;
});
}

View file

@ -56,7 +56,7 @@ TEST_F(SchedulerPredicates, MOV)
TEST_F(SchedulerPredicates, FMA)
{
bi_instr *fma = bi_fma_f32_to(b, TMP(), TMP(), TMP(), bi_zero(), BI_ROUND_NONE);
bi_instr *fma = bi_fma_f32_to(b, TMP(), TMP(), TMP(), bi_zero());
ASSERT_TRUE(bi_can_fma(fma));
ASSERT_FALSE(bi_can_add(fma));
ASSERT_FALSE(bi_must_message(fma));
@ -96,12 +96,12 @@ TEST_F(SchedulerPredicates, BLEND)
TEST_F(SchedulerPredicates, RestrictionsOnModifiersOfSameCycleTemporaries)
{
bi_instr *fadd = bi_fadd_f32_to(b, TMP(), TMP(), TMP(), BI_ROUND_NONE);
bi_instr *fadd = bi_fadd_f32_to(b, TMP(), TMP(), TMP());
ASSERT_TRUE(bi_reads_t(fadd, 0));
for (unsigned i = 0; i < 2; ++i) {
for (unsigned j = 0; j < 2; ++j) {
bi_instr *fadd = bi_fadd_f32_to(b, TMP(), TMP(), TMP(), BI_ROUND_NONE);
bi_instr *fadd = bi_fadd_f32_to(b, TMP(), TMP(), TMP());
fadd->src[i] = bi_swz_16(TMP(), j, j);
ASSERT_TRUE(bi_reads_t(fadd, 1 - i));
ASSERT_FALSE(bi_reads_t(fadd, i));
@ -115,7 +115,7 @@ TEST_F(SchedulerPredicates, RestrictionsOnFAddV2F16)
bi_index y = bi_register(1);
/* Basic */
bi_instr *fadd = bi_fadd_v2f16_to(b, TMP(), x, x, BI_ROUND_NONE);
bi_instr *fadd = bi_fadd_v2f16_to(b, TMP(), x, x);
ASSERT_TRUE(bi_can_fma(fadd));
ASSERT_TRUE(bi_can_add(fadd));

View file

@ -60,45 +60,37 @@ TEST_F(AddImm, Basic) {
CASE(bi_mov_i32_to(b, bi_register(63), bi_imm_u32(0xABAD1DEA)),
bi_iadd_imm_i32_to(b, bi_register(63), bi_zero(), 0xABAD1DEA));
CASE(bi_fadd_f32_to(b, bi_register(1), bi_register(2), bi_imm_f32(42.0), BI_ROUND_NONE),
CASE(bi_fadd_f32_to(b, bi_register(1), bi_register(2), bi_imm_f32(42.0)),
bi_fadd_imm_f32_to(b, bi_register(1), bi_register(2), fui(42.0)));
CASE(bi_fadd_f32_to(b, bi_register(1), bi_discard(bi_register(2)), bi_imm_f32(42.0), BI_ROUND_NONE),
CASE(bi_fadd_f32_to(b, bi_register(1), bi_discard(bi_register(2)), bi_imm_f32(42.0)),
bi_fadd_imm_f32_to(b, bi_register(1), bi_discard(bi_register(2)), fui(42.0)));
CASE(bi_fadd_f32_to(b, bi_register(1), bi_discard(bi_register(2)), bi_neg(bi_imm_f32(42.0)), BI_ROUND_NONE),
CASE(bi_fadd_f32_to(b, bi_register(1), bi_discard(bi_register(2)), bi_neg(bi_imm_f32(42.0))),
bi_fadd_imm_f32_to(b, bi_register(1), bi_discard(bi_register(2)), fui(-42.0)));
}
TEST_F(AddImm, Commutativty) {
CASE(bi_fadd_f32_to(b, bi_register(1), bi_imm_f32(42.0), bi_register(2), BI_ROUND_NONE),
CASE(bi_fadd_f32_to(b, bi_register(1), bi_imm_f32(42.0), bi_register(2)),
bi_fadd_imm_f32_to(b, bi_register(1), bi_register(2), fui(42.0)));
}
TEST_F(AddImm, NoModifiers) {
NEGCASE(bi_fadd_f32_to(b, bi_register(1), bi_register(2), bi_imm_f32(42.0),
BI_ROUND_RTP));
NEGCASE(bi_fadd_f32_to(b, bi_register(1), bi_abs(bi_register(2)), bi_imm_f32(42.0),
BI_ROUND_NONE));
NEGCASE(bi_fadd_f32_to(b, bi_register(1), bi_neg(bi_register(2)), bi_imm_f32(42.0),
BI_ROUND_NONE));
NEGCASE(bi_fadd_f32_to(b, bi_register(1), bi_swz_16(bi_register(2), false, false), bi_imm_f32(42.0),
BI_ROUND_NONE));
NEGCASE(bi_fadd_f32_to(b, bi_register(1), bi_abs(bi_register(2)), bi_imm_f32(42.0)));
NEGCASE(bi_fadd_f32_to(b, bi_register(1), bi_neg(bi_register(2)), bi_imm_f32(42.0)));
NEGCASE(bi_fadd_f32_to(b, bi_register(1), bi_swz_16(bi_register(2), false, false), bi_imm_f32(42.0)));
}
TEST_F(AddImm, NoClamp) {
NEGCASE({
bi_instr *I = bi_fadd_f32_to(b, bi_register(1), bi_register(2),
bi_imm_f32(42.0), BI_ROUND_NONE);
bi_imm_f32(42.0));
I->clamp = BI_CLAMP_CLAMP_M1_1;
});
}
TEST_F(AddImm, OtherTypes) {
CASE(bi_fadd_v2f16_to(b, bi_register(1), bi_register(2), bi_imm_f16(42.0), BI_ROUND_NONE),
CASE(bi_fadd_v2f16_to(b, bi_register(1), bi_register(2), bi_imm_f16(42.0)),
bi_fadd_imm_v2f16_to(b, bi_register(1), bi_register(2), 0x51405140));
CASE(bi_iadd_u32_to(b, bi_register(1), bi_register(2), bi_imm_u32(0xDEADBEEF), false),
@ -119,7 +111,6 @@ TEST_F(AddImm, OtherTypes) {
CASE(bi_iadd_v4s8_to(b, bi_register(1), bi_register(2), bi_imm_u32(0xDEADBEEF), false),
bi_iadd_imm_v4i8_to(b, bi_register(1), bi_register(2), 0xDEADBEEF));
NEGCASE(bi_fadd_v2f16_to(b, bi_register(1), bi_register(2), bi_imm_f16(42.0), BI_ROUND_RTZ));
NEGCASE(bi_iadd_u32_to(b, bi_register(1), bi_swz_16(bi_register(2), false, false), bi_imm_u32(0xDEADBEEF), false));
NEGCASE(bi_iadd_v2u16_to(b, bi_register(1), bi_swz_16(bi_register(2), false, false), bi_imm_u32(0xDEADBEEF), false));
NEGCASE(bi_iadd_u32_to(b, bi_register(1), bi_register(2), bi_imm_u32(0xDEADBEEF), true));
@ -135,3 +126,16 @@ TEST_F(AddImm, Int8) {
NEGCASE(bi_iadd_v4u8_to(b, bi_register(1), idx, bi_imm_u32(0xDEADBEEF), false));
NEGCASE(bi_iadd_v4s8_to(b, bi_register(1), idx, bi_imm_u32(0xDEADBEEF), false));
}
TEST_F(AddImm, OnlyRTE) {
NEGCASE({
bi_instr *I = bi_fadd_f32_to(b, bi_register(1), bi_register(2), bi_imm_f32(42.0));
I->round = BI_ROUND_RTP;
});
NEGCASE({
bi_instr *I = bi_fadd_v2f16_to(b, bi_register(1), bi_register(2), bi_imm_f16(42.0));
I->round = BI_ROUND_RTZ;
});
}

View file

@ -102,7 +102,7 @@ TEST_F(LowerIsel, IntegerCSEL) {
}
TEST_F(LowerIsel, Smoke) {
NEGCASE(bi_fadd_f32_to(b, reg, reg, reg, BI_ROUND_RTP));
NEGCASE(bi_fadd_f32_to(b, reg, reg, reg));
NEGCASE(bi_csel_s32_to(b, reg, reg, reg, reg, reg, BI_CMPF_LT));
NEGCASE(bi_csel_u32_to(b, reg, reg, reg, reg, reg, BI_CMPF_LT));
}

View file

@ -65,42 +65,42 @@ TEST_F(ValhallPacking, Moves) {
}
TEST_F(ValhallPacking, Fadd) {
CASE(bi_fadd_f32_to(b, bi_register(0), bi_register(1), bi_register(2), BI_ROUND_NONE),
CASE(bi_fadd_f32_to(b, bi_register(0), bi_register(1), bi_register(2)),
0x00a4c00000000201ULL);
CASE(bi_fadd_f32_to(b, bi_register(0), bi_register(1), bi_abs(bi_register(2)), BI_ROUND_NONE),
CASE(bi_fadd_f32_to(b, bi_register(0), bi_register(1), bi_abs(bi_register(2))),
0x00a4c02000000201ULL);
CASE(bi_fadd_f32_to(b, bi_register(0), bi_register(1), bi_neg(bi_register(2)), BI_ROUND_NONE),
CASE(bi_fadd_f32_to(b, bi_register(0), bi_register(1), bi_neg(bi_register(2))),
0x00a4c01000000201ULL);
CASE(bi_fadd_v2f16_to(b, bi_register(0), bi_swz_16(bi_register(1), false, false),
bi_swz_16(bi_register(0), true, true), BI_ROUND_NONE),
bi_swz_16(bi_register(0), true, true)),
0x00a5c0000c000001ULL);
CASE(bi_fadd_v2f16_to(b, bi_register(0), bi_register(1), bi_register(0), BI_ROUND_NONE),
CASE(bi_fadd_v2f16_to(b, bi_register(0), bi_register(1), bi_register(0)),
0x00a5c00028000001ULL);
CASE(bi_fadd_v2f16_to(b, bi_register(0), bi_register(1),
bi_swz_16(bi_register(0), true, false), BI_ROUND_NONE),
bi_swz_16(bi_register(0), true, false)),
0x00a5c00024000001ULL);
CASE(bi_fadd_v2f16_to(b, bi_register(0), bi_discard(bi_abs(bi_register(0))),
bi_neg(zero), BI_ROUND_NONE),
bi_neg(zero)),
0x00a5c0902800c040ULL);
CASE(bi_fadd_f32_to(b, bi_register(0), bi_register(1),
zero, BI_ROUND_NONE),
zero),
0x00a4c0000000c001ULL);
CASE(bi_fadd_f32_to(b, bi_register(0), bi_register(1),
bi_neg(zero), BI_ROUND_NONE),
bi_neg(zero)),
0x00a4c0100000c001ULL);
CASE(bi_fadd_f32_to(b, bi_register(0), bi_register(1),
bi_half(bi_register(0), true), BI_ROUND_NONE),
bi_half(bi_register(0), true)),
0x00a4c00008000001ULL);
CASE(bi_fadd_f32_to(b, bi_register(0), bi_register(1),
bi_half(bi_register(0), false), BI_ROUND_NONE),
bi_half(bi_register(0), false)),
0x00a4c00004000001ULL);
}
@ -112,8 +112,7 @@ TEST_F(ValhallPacking, Clper) {
TEST_F(ValhallPacking, Clamps) {
bi_instr *I = bi_fadd_f32_to(b, bi_register(0), bi_register(1),
bi_neg(bi_abs(bi_register(2))),
BI_ROUND_NONE);
bi_neg(bi_abs(bi_register(2))));
CASE(I, 0x00a4c03000000201ULL);
I->clamp = BI_CLAMP_CLAMP_M1_1;
@ -123,7 +122,7 @@ TEST_F(ValhallPacking, Clamps) {
TEST_F(ValhallPacking, Misc) {
CASE(bi_fma_f32_to(b, bi_register(1), bi_discard(bi_register(1)),
bi_fau((enum bir_fau) (BIR_FAU_UNIFORM | 4), false),
bi_neg(zero), BI_ROUND_NONE),
bi_neg(zero)),
0x00b2c10400c08841ULL);
CASE(bi_fround_f32_to(b, bi_register(2), bi_discard(bi_neg(bi_register(2))),
@ -164,7 +163,7 @@ TEST_F(ValhallPacking, Comparions) {
}
TEST_F(ValhallPacking, Conversions) {
CASE(bi_v2s16_to_v2f16_to(b, bi_register(2), bi_discard(bi_register(2)), BI_ROUND_NONE),
CASE(bi_v2s16_to_v2f16_to(b, bi_register(2), bi_discard(bi_register(2))),
0x0090c22000070042);
}
@ -219,7 +218,7 @@ TEST_F(ValhallPacking, Transcendentals) {
CASE(bi_frsq_f32_to(b, bi_register(2), bi_register(1)),
0x009cc20000020001);
CASE(bi_fma_rscale_f32_to(b, bi_register(0), bi_discard(bi_register(1)), bi_discard(bi_register(2)), bi_neg(zero), bi_discard(bi_register(0)), BI_ROUND_NONE, BI_SPECIAL_LEFT),
CASE(bi_fma_rscale_f32_to(b, bi_register(0), bi_discard(bi_register(1)), bi_discard(bi_register(2)), bi_neg(zero), bi_discard(bi_register(0)), BI_SPECIAL_LEFT),
0x0162c00440c04241);
}

View file

@ -66,53 +66,47 @@ protected:
TEST_F(ValidateFau, One64BitUniformSlot)
{
VALID(bi_fma_f32_to(b, bi_register(1), bi_register(2), bi_register(3),
unif, BI_ROUND_NONE));
unif));
VALID(bi_fma_f32_to(b, bi_register(1), bi_register(2), bi_word(unif, 1),
unif, BI_ROUND_NONE));
VALID(bi_fma_f32_to(b, bi_register(1), unif, unif, bi_word(unif, 1),
BI_ROUND_NONE));
INVALID(bi_fma_f32_to(b, bi_register(1), unif, unif2, bi_register(1),
BI_ROUND_NONE));
INVALID(bi_fma_f32_to(b, bi_register(1), unif, unif2, bi_word(unif, 1),
BI_ROUND_NONE));
unif));
VALID(bi_fma_f32_to(b, bi_register(1), unif, unif, bi_word(unif, 1)));
INVALID(bi_fma_f32_to(b, bi_register(1), unif, unif2, bi_register(1)));
INVALID(bi_fma_f32_to(b, bi_register(1), unif, unif2, bi_word(unif, 1)));
/* Crafted case that appears correct at first glance and was erronously
* marked as valid in early versions of the validator.
*/
INVALID(bi_fma_f32_to(b, bi_register(1), bi_register(2),
bi_fau((enum bir_fau) (BIR_FAU_UNIFORM | 0), false),
bi_fau((enum bir_fau) (BIR_FAU_UNIFORM | 1), true),
BI_ROUND_NONE));
bi_fau((enum bir_fau) (BIR_FAU_UNIFORM | 1), true)));
}
TEST_F(ValidateFau, Combined64BitUniformsConstants)
{
VALID(bi_fma_f32_to(b, bi_register(1), bi_register(2), bi_word(unif, 1),
unif, BI_ROUND_NONE));
VALID(bi_fma_f32_to(b, bi_register(1), bi_register(2), zero,
unif, BI_ROUND_NONE));
VALID(bi_fma_f32_to(b, bi_register(1), zero, imm1, imm1, BI_ROUND_NONE));
INVALID(bi_fma_f32_to(b, bi_register(1), zero, bi_word(unif, 1),
unif, BI_ROUND_NONE));
INVALID(bi_fma_f32_to(b, bi_register(1), zero, imm1, imm2, BI_ROUND_NONE));
unif));
VALID(bi_fma_f32_to(b, bi_register(1), bi_register(2), zero, unif));
VALID(bi_fma_f32_to(b, bi_register(1), zero, imm1, imm1));
INVALID(bi_fma_f32_to(b, bi_register(1), zero, bi_word(unif, 1), unif));
INVALID(bi_fma_f32_to(b, bi_register(1), zero, imm1, imm2));
}
TEST_F(ValidateFau, UniformsOnlyInDefaultMode)
{
INVALID(bi_fma_f32_to(b, bi_register(1), bi_register(2), bi_word(unif, 1),
lane_id, BI_ROUND_NONE));
lane_id));
INVALID(bi_fma_f32_to(b, bi_register(1), bi_register(2), bi_word(unif, 1),
core_id, BI_ROUND_NONE));
core_id));
}
TEST_F(ValidateFau, SingleSpecialImmediate)
{
VALID(bi_fma_f32_to(b, bi_register(1), bi_register(2), bi_register(2),
lane_id, BI_ROUND_NONE));
lane_id));
VALID(bi_fma_f32_to(b, bi_register(1), bi_register(2), bi_register(2),
core_id, BI_ROUND_NONE));
core_id));
INVALID(bi_fma_f32_to(b, bi_register(1), bi_register(2), lane_id,
core_id, BI_ROUND_NONE));
core_id));
}
TEST_F(ValidateFau, SmokeTests)
@ -120,5 +114,5 @@ TEST_F(ValidateFau, SmokeTests)
VALID(bi_mov_i32_to(b, bi_register(1), bi_register(2)));
VALID(bi_mov_i32_to(b, bi_register(1), unif));
VALID(bi_fma_f32_to(b, bi_register(1), bi_discard(bi_register(1)),
unif, bi_neg(zero), BI_ROUND_NONE));
unif, bi_neg(zero)));
}