From f05174e5a5a58cb86440457df3c501cfb3099adb Mon Sep 17 00:00:00 2001
From: Alyssa Rosenzweig
Date: Wed, 16 Dec 2020 13:58:12 -0500
Subject: [PATCH] pan/bi: Implement ALU with the builder

Signed-off-by: Alyssa Rosenzweig
Part-of: 
---
 src/panfrost/bifrost/bifrost_compile.c | 396 +++++++++++++++++++++++++
 1 file changed, 396 insertions(+)

diff --git a/src/panfrost/bifrost/bifrost_compile.c b/src/panfrost/bifrost/bifrost_compile.c
index 397a7c257ba..34327fedf1a 100644
--- a/src/panfrost/bifrost/bifrost_compile.c
+++ b/src/panfrost/bifrost/bifrost_compile.c
@@ -2075,6 +2075,402 @@ bi_cmpf_nir(nir_op op)
         }
 }
 
+static void
+bi_emit_alu(bi_builder *b, nir_alu_instr *instr)
+{
+        bi_index dst = bi_dest_index(&instr->dest.dest);
+        unsigned sz = nir_dest_bit_size(instr->dest.dest);
+
+        unsigned srcs = nir_op_infos[instr->op].num_inputs;
+        unsigned comps = nir_dest_num_components(instr->dest.dest);
+
+        if (!instr->dest.dest.is_ssa) {
+                for (unsigned i = 0; i < comps; ++i)
+                        assert(instr->dest.write_mask & (1 << i));
+        }
+
+        /* First, match against the various moves in NIR. These are
+         * special-cased because they can operate on vectors even after
+         * lowering ALU to scalar. For Bifrost, bi_alu_src_index assumes the
+         * instruction is no "bigger" than SIMD-within-a-register. These moves
+         * are the exceptions that need to handle swizzles specially. */
+
+        switch (instr->op) {
+        case nir_op_vec2:
+        case nir_op_vec3:
+        case nir_op_vec4: {
+                bi_index unoffset_srcs[4] = {
+                        srcs > 0 ? bi_src_index(&instr->src[0].src) : bi_null(),
+                        srcs > 1 ? bi_src_index(&instr->src[1].src) : bi_null(),
+                        srcs > 2 ? bi_src_index(&instr->src[2].src) : bi_null(),
+                        srcs > 3 ? bi_src_index(&instr->src[3].src) : bi_null(),
+                };
+
+                unsigned channels[4] = {
+                        instr->src[0].swizzle[0],
+                        instr->src[1].swizzle[0],
+                        srcs > 2 ? instr->src[2].swizzle[0] : 0,
+                        srcs > 3 ? instr->src[3].swizzle[0] : 0,
+                };
+
+                bi_make_vec_to(b, dst, unoffset_srcs, channels, srcs, sz);
+                return;
+        }
+
+        case nir_op_vec8:
+        case nir_op_vec16:
+                unreachable("should've been lowered");
+
+        case nir_op_mov: {
+                bi_index idx = bi_src_index(&instr->src[0].src);
+                bi_index unoffset_srcs[4] = { idx, idx, idx, idx };
+
+                unsigned channels[4] = {
+                        comps > 0 ? instr->src[0].swizzle[0] : 0,
+                        comps > 1 ? instr->src[0].swizzle[1] : 0,
+                        comps > 2 ? instr->src[0].swizzle[2] : 0,
+                        comps > 3 ? instr->src[0].swizzle[3] : 0,
+                };
+
+                bi_make_vec_to(b, dst, unoffset_srcs, channels, comps, sz);
+                return;
+        }
+
+        default:
+                break;
+        }
+
+        bi_index s0 = srcs > 0 ? bi_alu_src_index(instr->src[0], comps) : bi_null();
+        bi_index s1 = srcs > 1 ? bi_alu_src_index(instr->src[1], comps) : bi_null();
+        bi_index s2 = srcs > 2 ? bi_alu_src_index(instr->src[2], comps) : bi_null();
+
+        unsigned src_sz = srcs > 0 ? nir_src_bit_size(instr->src[0].src) : 0;
+
+        switch (instr->op) {
+        case nir_op_ffma:
+                bi_fma_to(b, sz, dst, s0, s1, s2, BI_ROUND_NONE);
+                break;
+
+        case nir_op_fmul:
+                bi_fma_to(b, sz, dst, s0, s1, bi_zero(), BI_ROUND_NONE);
+                break;
+
+        case nir_op_fsub:
+                s1 = bi_neg(s1);
+                /* fallthrough */
+        case nir_op_fadd:
+                bi_fadd_to(b, sz, dst, s0, s1, BI_ROUND_NONE);
+                break;
+
+        case nir_op_fsat: {
+                bi_instr *I = (sz == 32) ?
+                        bi_fadd_f32_to(b, dst, s0, bi_zero(), BI_ROUND_NONE) :
+                        bi_fma_v2f16_to(b, dst, s0, bi_imm_f16(1.0f), bi_zero(),
+                                        BI_ROUND_NONE);
+
+                I->clamp = BI_CLAMP_CLAMP_0_1;
+                break;
+        }
+
+        case nir_op_fneg:
+                bi_fadd_to(b, sz, dst, bi_neg(s0), bi_zero(), BI_ROUND_NONE);
+                break;
+
+        case nir_op_fabs:
+                bi_fadd_to(b, sz, dst, bi_abs(s0), bi_zero(), BI_ROUND_NONE);
+                break;
+
+        case nir_op_fexp2: {
+                /* TODO G71 */
+                assert(sz == 32); /* should've been lowered */
+
+                /* multiply by 1.0 * 2^24 */
+                bi_index scale = bi_fma_rscale_f32(b, s0, bi_imm_f32(1.0f),
+                                bi_zero(), bi_imm_u32(24), BI_ROUND_NONE,
+                                BI_SPECIAL_NONE);
+
+                bi_fexp_f32_to(b, dst, bi_f32_to_s32(b, scale, BI_ROUND_NONE), s0);
+                break;
+        }
+
+        case nir_op_flog2: {
+                /* TODO G71 */
+                assert(sz == 32); /* should've been lowered */
+                bi_index frexp = bi_frexpe_f32(b, s0, true, false);
+                bi_index frexpi = bi_s32_to_f32(b, frexp, BI_ROUND_RTZ);
+                bi_index add = bi_fadd_lscale_f32(b, bi_imm_f32(-1.0f), s0);
+                bi_fma_f32_to(b, dst, bi_flogd_f32(b, s0), add, frexpi,
+                                BI_ROUND_NONE);
+                break;
+        }
+
+        case nir_op_b8csel:
+        case nir_op_b16csel:
+        case nir_op_b32csel:
+                if (sz == 8)
+                        bi_mux_v4i8_to(b, dst, s2, s1, s0, BI_MUX_INT_ZERO);
+                else
+                        bi_csel_to(b, sz, dst, s0, bi_zero(), s1, s2, BI_CMPF_NE);
+                break;
+
+        case nir_op_ishl:
+                bi_lshift_or_to(b, sz, dst, s0, bi_zero(), bi_byte(s1, 0));
+                break;
+        case nir_op_ushr:
+                bi_rshift_or_to(b, sz, dst, s0, bi_zero(), bi_byte(s1, 0));
+                break;
+
+        case nir_op_ishr:
+                bi_arshift_to(b, sz, dst, s0, bi_null(), bi_byte(s1, 0));
+                break;
+
+        case nir_op_flt32:
+        case nir_op_fge32:
+        case nir_op_feq32:
+        case nir_op_fneu32:
+                bi_fcmp_to(b, sz, dst, s0, s1, bi_cmpf_nir(instr->op),
+                                BI_RESULT_TYPE_M1);
+                break;
+
+        case nir_op_ieq32:
+        case nir_op_ine32:
+                if (sz == 32) {
+                        bi_icmp_i32_to(b, dst, s0, s1, bi_cmpf_nir(instr->op),
+                                        BI_RESULT_TYPE_M1);
+                } else if (sz == 16) {
+                        bi_icmp_v2i16_to(b, dst, s0, s1, bi_cmpf_nir(instr->op),
+                                        BI_RESULT_TYPE_M1);
+                } else {
+                        bi_icmp_v4i8_to(b, dst, s0, s1, bi_cmpf_nir(instr->op),
+                                        BI_RESULT_TYPE_M1);
+                }
+                break;
+
+        case nir_op_ilt32:
+        case nir_op_ige32:
+                if (sz == 32) {
+                        bi_icmp_s32_to(b, dst, s0, s1, bi_cmpf_nir(instr->op),
+                                        BI_RESULT_TYPE_M1);
+                } else if (sz == 16) {
+                        bi_icmp_v2s16_to(b, dst, s0, s1, bi_cmpf_nir(instr->op),
+                                        BI_RESULT_TYPE_M1);
+                } else {
+                        bi_icmp_v4s8_to(b, dst, s0, s1, bi_cmpf_nir(instr->op),
+                                        BI_RESULT_TYPE_M1);
+                }
+                break;
+
+        case nir_op_ult32:
+        case nir_op_uge32:
+                if (sz == 32) {
+                        bi_icmp_u32_to(b, dst, s0, s1, bi_cmpf_nir(instr->op),
+                                        BI_RESULT_TYPE_M1);
+                } else if (sz == 16) {
+                        bi_icmp_v2u16_to(b, dst, s0, s1, bi_cmpf_nir(instr->op),
+                                        BI_RESULT_TYPE_M1);
+                } else {
+                        bi_icmp_v4u8_to(b, dst, s0, s1, bi_cmpf_nir(instr->op),
+                                        BI_RESULT_TYPE_M1);
+                }
+                break;
+
+        case nir_op_fddx:
+        case nir_op_fddy: {
+                bi_index cur_lane = bi_mov_i32(b, bi_fau(BIR_FAU_LANE_ID, false));
+
+                bi_index lane1 = bi_lshift_and_i32(b, cur_lane,
+                                bi_imm_u32(instr->op == nir_op_fddx ? 2 : 1),
+                                bi_byte(bi_zero(), 0));
+
+                bi_index lane2 = bi_iadd_u32(b, lane1,
+                                bi_imm_u32(instr->op == nir_op_fddx ? 1 : 2),
+                                false);
+
+                bi_index left, right;
+
+                if (b->shader->arch == 6) {
+                        left = bi_clper_v6_i32(b, s0, lane1);
+                        right = bi_clper_v6_i32(b, s0, lane2);
+                } else {
+                        left = bi_clper_v7_i32(b, s0, lane1,
+                                        BI_INACTIVE_RESULT_ZERO, BI_LANE_OP_NONE,
+                                        BI_SUBGROUP_SUBGROUP4);
+
+                        right = bi_clper_v7_i32(b, s0, lane2,
+                                        BI_INACTIVE_RESULT_ZERO, BI_LANE_OP_NONE,
+                                        BI_SUBGROUP_SUBGROUP4);
+                }
+
+                bi_fadd_to(b, sz, dst, right, bi_neg(left), BI_ROUND_NONE);
+                break;
+        }
+
+        case nir_op_f2f16:
+                bi_v2f32_to_v2f16_to(b, dst, s0, s0, BI_ROUND_NONE);
+                break;
+
+        case nir_op_f2f32:
+                bi_f16_to_f32_to(b, dst, s0);
+                break;
+
+        case nir_op_f2i32:
+                if (src_sz == 32)
+                        bi_f32_to_s32_to(b, dst, s0, BI_ROUND_RTZ);
+                else
+                        bi_f16_to_s32_to(b, dst, s0, BI_ROUND_RTZ);
+                break;
+
+        case nir_op_f2u16:
+                if (src_sz == 32)
+                        unreachable("should've been lowered");
+                else
+                        bi_v2f16_to_v2u16_to(b, dst, s0, BI_ROUND_RTZ);
+                break;
+
+        case nir_op_f2i16:
+                if (src_sz == 32)
+                        unreachable("should've been lowered");
+                else
+                        bi_v2f16_to_v2s16_to(b, dst, s0, BI_ROUND_RTZ);
+                break;
+
+        case nir_op_f2u32:
+                if (src_sz == 32)
+                        bi_f32_to_u32_to(b, dst, s0, BI_ROUND_RTZ);
+                else
+                        bi_f16_to_u32_to(b, dst, s0, BI_ROUND_RTZ);
+                break;
+
+        case nir_op_u2f16:
+                if (src_sz == 32)
+                        unreachable("should've been lowered");
+                else if (src_sz == 16)
+                        bi_v2u16_to_v2f16_to(b, dst, s0, BI_ROUND_RTZ);
+                else if (src_sz == 8)
+                        bi_v2u8_to_v2f16_to(b, dst, s0);
+                break;
+
+        case nir_op_u2f32:
+                if (src_sz == 32)
+                        bi_u32_to_f32_to(b, dst, s0, BI_ROUND_RTZ);
+                else if (src_sz == 16)
+                        bi_u16_to_f32_to(b, dst, s0);
+                else
+                        bi_u8_to_f32_to(b, dst, bi_byte(s0, 0));
+                break;
+
+        case nir_op_i2f16:
+                if (src_sz == 32)
+                        unreachable("should've been lowered");
+                else
+                        bi_v2s16_to_v2f16_to(b, dst, s0, BI_ROUND_RTZ);
+                break;
+
+        case nir_op_i2f32:
+                if (src_sz == 32)
+                        bi_s32_to_f32_to(b, dst, s0, BI_ROUND_RTZ);
+                else
+                        bi_s16_to_f32_to(b, dst, s0);
+                break;
+
+        case nir_op_i2i32:
+                if (src_sz == 16)
+                        bi_s16_to_s32_to(b, dst, s0);
+                else
+                        bi_s8_to_s32_to(b, dst, s0);
+                break;
+
+        case nir_op_u2u32:
+                if (src_sz == 16)
+                        bi_u16_to_u32_to(b, dst, s0);
+                else
+                        bi_u8_to_u32_to(b, dst, s0);
+                break;
+
+        /* TODO optimize out downcasts */
+        case nir_op_i2i16:
+                assert(src_sz == 8 || src_sz == 32);
+
+                if (src_sz == 8)
+                        bi_v2s8_to_v2s16_to(b, dst, s0);
+                else
+                        bi_mkvec_v2i16_to(b, dst, bi_half(s0, false), bi_imm_u16(0));
+                break;
+
+        case nir_op_u2u16:
+                assert(src_sz == 8 || src_sz == 32);
+
+                if (src_sz == 8)
+                        bi_v2u8_to_v2u16_to(b, dst, s0);
+                else
+                        bi_mkvec_v2i16_to(b, dst, bi_half(s0, false), bi_imm_u16(0));
+                break;
+
+        case nir_op_i2i8:
+        case nir_op_u2u8:
+                unreachable("should've been lowered");
+
+        case nir_op_fround_even:
+        case nir_op_fceil:
+        case nir_op_ffloor:
+        case nir_op_ftrunc:
+                bi_fround_to(b, sz, dst, s0, bi_nir_round(instr->op));
+                break;
+
+        case nir_op_fmin:
+                bi_fmin_to(b, sz, dst, s0, s1);
+                break;
+
+        case nir_op_fmax:
+                bi_fmax_to(b, sz, dst, s0, s1);
+                break;
+
+        case nir_op_iadd:
+                bi_iadd_to(b, sz, dst, s0, s1, false);
+                break;
+
+        case nir_op_isub:
+                bi_isub_to(b, sz, dst, s0, s1, false);
+                break;
+
+        case nir_op_imul:
+                bi_imul_to(b, sz, dst, s0, s1);
+                break;
+
+        case nir_op_iabs:
+                bi_iabs_to(b, sz, dst, s0);
+                break;
+
+        case nir_op_iand:
+                bi_lshift_and_to(b, sz, dst, s0, s1, bi_imm_u8(0));
+                break;
+
+        case nir_op_ior:
+                bi_lshift_or_to(b, sz, dst, s0, s1, bi_imm_u8(0));
+                break;
+
+        case nir_op_ixor:
+                bi_lshift_xor_to(b, sz, dst, s0, s1, bi_imm_u8(0));
+                break;
+
+        case nir_op_inot:
+                bi_lshift_or_to(b, sz, dst, bi_zero(), bi_not(s0), bi_imm_u8(0));
+                break;
+
+        case nir_op_frsq:
+                bi_frsq_to(b, sz, dst, s0);
+                break;
+
+        case nir_op_frcp:
+                bi_frcp_to(b, sz, dst, s0);
+                break;
+
+        default:
+                fprintf(stderr, "Unhandled ALU op %s\n", nir_op_infos[instr->op].name);
+                unreachable("Unknown ALU op");
+        }
+}
+
 /* TEXS instructions assume normal 2D f32 operation but are more
  * space-efficient and with simpler RA/scheduling requirements*/
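
Editor's note on the nir_op_fddx/nir_op_fddy case: derivatives are taken
across a 2x2 quad, and the LSHIFT_AND/IADD pair simply selects which two
lanes to subtract. The standalone C sketch below (not part of the patch)
mirrors that lane arithmetic on the CPU, assuming the usual quad layout
where lane 0 is top-left, 1 top-right, 2 bottom-left, 3 bottom-right:

#include <stdio.h>

int main(void)
{
        for (unsigned lane = 0; lane < 4; ++lane) {
                /* fddx: lane1 = lane & 2 selects the left column of this
                 * lane's row; lane2 = lane1 + 1 is the right column */
                unsigned ddx_left = lane & 2;
                unsigned ddx_right = ddx_left + 1;

                /* fddy: lane1 = lane & 1 selects the top row of this
                 * lane's column; lane2 = lane1 + 2 is the bottom row */
                unsigned ddy_top = lane & 1;
                unsigned ddy_bottom = ddy_top + 2;

                printf("lane %u: ddx = v%u - v%u, ddy = v%u - v%u\n",
                       lane, ddx_right, ddx_left, ddy_bottom, ddy_top);
        }

        return 0;
}

CLPER then broadcasts s0 from those two lanes, and the final FADD computes
right + (-left), so both lanes of a row (for fddx) or of a column (for
fddy) receive the same coarse derivative.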
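A similar note on nir_op_fexp2: the FMA_RSCALE by 2^24 followed by
F32_TO_S32 converts x to 8.24 fixed point, splitting x into integer and
fractional parts so that 2^x = 2^i * 2^f. A minimal sketch of that
reduction in plain C (how FEXP_F32 consumes the fixed-point value and the
original s0 is a hardware detail, assumed here rather than documented):

#include <math.h>
#include <stdint.h>
#include <stdio.h>

int main(void)
{
        float x = 3.7f;

        /* x in 8.24 fixed point, as built by FMA_RSCALE(x, 1.0, 0, 24)
         * and F32_TO_S32 in the patch */
        int32_t fixed = (int32_t)lrintf(x * (float)(1 << 24));

        int32_t i = fixed >> 24;                          /* integer part */
        float f = (fixed & 0xffffff) / (float)(1 << 24);  /* fraction */

        /* 2^x = 2^i * 2^f: i lands in the result exponent, 2^f in the
         * mantissa */
        printf("exp2f(%f) = %f, reconstructed = %f\n",
               x, exp2f(x), ldexpf(exp2f(f), i));

        return 0;
}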