From 3df5446cbd460aee6c02c570ecbc285102abc9f0 Mon Sep 17 00:00:00 2001 From: Alyssa Rosenzweig Date: Wed, 11 May 2022 15:39:56 -0400 Subject: [PATCH] pan/bi: Simplify register precolouring in the IR In the current IR, any register may be preloaded by reading it anywhere, and any register may be precoloured by writing it anywhere. This is convenient for instruction selection, but requires the register allocator to do considerable gymnastics to ensure it doesn't clobber precoloured registers. It also breaks the purity of our SSA representation, which complicates optimization passes (e.g. copyprop). Let's trade some instruction selection complexity for simplifying register allocation by constraining how register precolouring works. Under the new model: * Registers may only be preloaded at the start of the program. * Precoloured destinations are handled explicitly by RA. Internally, a stronger invariant is placed for preloading: registers may only be preloaded by MOV.i32 instructions at the beginning of the block, and these moves must be unique. These invariants ensure RA can trivially coalesce the moves. A bi_preload helper is added as a safe version of bi_register respecting these invariants, allowing a smooth transition for instruction selection. Signed-off-by: Alyssa Rosenzweig Part-of: --- src/panfrost/bifrost/bi_opt_dce.c | 5 +- src/panfrost/bifrost/bi_opt_message_preload.c | 16 ++- src/panfrost/bifrost/bi_ra.c | 46 +++++++ src/panfrost/bifrost/bifrost_compile.c | 129 ++++++++++++------ src/panfrost/bifrost/compiler.h | 28 ++-- .../bifrost/test/test-message-preload.cpp | 5 +- 6 files changed, 163 insertions(+), 66 deletions(-) diff --git a/src/panfrost/bifrost/bi_opt_dce.c b/src/panfrost/bifrost/bi_opt_dce.c index dc725d0d8e5..b0901bb2f71 100644 --- a/src/panfrost/bifrost/bi_opt_dce.c +++ b/src/panfrost/bifrost/bi_opt_dce.c @@ -53,7 +53,10 @@ bi_opt_dead_code_eliminate(bi_context *ctx) if (ins->op == BI_OPCODE_AXCHG_I32 || ins->op == BI_OPCODE_ACMPXCHG_I32 || ins->op == BI_OPCODE_ATOM_RETURN_I32 || - ins->op == BI_OPCODE_ATOM1_RETURN_I32) + ins->op == BI_OPCODE_ATOM1_RETURN_I32 || + ins->op == BI_OPCODE_BLEND || + ins->op == BI_OPCODE_ATEST || + ins->op == BI_OPCODE_ZS_EMIT) continue; if (index < temp_count && !(live[index] & bi_writemask(ins, d))) diff --git a/src/panfrost/bifrost/bi_opt_message_preload.c b/src/panfrost/bifrost/bi_opt_message_preload.c index c8632ca52bf..3a261019e25 100644 --- a/src/panfrost/bifrost/bi_opt_message_preload.c +++ b/src/panfrost/bifrost/bi_opt_message_preload.c @@ -124,13 +124,23 @@ bi_opt_message_preload(bi_context *ctx) /* Report the preloading */ ctx->info.bifrost->messages[nr_preload] = msg; - /* Replace with moves at the start. Ideally, they will be - * coalesced out or copy propagated. + /* Replace with a collect of preloaded registers. The collect + * kills the moves, so the collect is free (it is coalesced). */ + b.cursor = bi_before_instr(I); + bi_instr *collect = bi_collect_i32_to(&b, I->dest[0]); collect->nr_srcs = bi_count_write_registers(I, 0); + + /* The registers themselves must be preloaded at the start of + * the program. Preloaded registers are coalesced, so these + * moves are free. 
+ */ + b.cursor = bi_before_block(block); for (unsigned i = 0; i < collect->nr_srcs; ++i) { - collect->src[i] = bi_register((nr_preload * 4) + i); + unsigned reg = (nr_preload * 4) + i; + + collect->src[i] = bi_mov_i32(&b, bi_register(reg)); } bi_remove_instruction(I); diff --git a/src/panfrost/bifrost/bi_ra.c b/src/panfrost/bifrost/bi_ra.c index 8b088f5e5a7..9dbe5ccbc85 100644 --- a/src/panfrost/bifrost/bi_ra.c +++ b/src/panfrost/bifrost/bi_ra.c @@ -352,11 +352,57 @@ bi_allocate_registers(bi_context *ctx, bool *success, bool full_regs) node = bi_get_node(ins->src[4]); if (node < node_count) l->solutions[node] = 4; + + /* Writes to R48 */ + node = bi_get_node(ins->dest[0]); + if (!bi_is_null(ins->dest[0])) { + assert(node < node_count); + l->solutions[node] = 48; + } + } + + /* Coverage mask writes stay in R60 */ + if ((ins->op == BI_OPCODE_ATEST || + ins->op == BI_OPCODE_ZS_EMIT) && + !bi_is_null(ins->dest[0])) { + unsigned node = bi_get_node(ins->dest[0]); + assert(node < node_count); + l->solutions[node] = 60; } } bi_compute_interference(ctx, l, full_regs); + /* Coalesce register moves if we're allowed. We need to be careful due + * to the restricted affinity induced by the blend shader ABI. + */ + bi_foreach_instr_global(ctx, I) { + if (I->op != BI_OPCODE_MOV_I32) continue; + if (I->src[0].type != BI_INDEX_REGISTER) continue; + + unsigned reg = I->src[0].value; + unsigned node = bi_get_node(I->dest[0]); + assert(node < node_count); + + if (l->solutions[node] != ~0) continue; + + uint64_t affinity = l->affinity[node]; + + if (ctx->inputs->is_blend) { + /* We're allowed to coalesce the moves to these */ + affinity |= BITFIELD64_BIT(48); + affinity |= BITFIELD64_BIT(60); + } + + /* Try to coalesce */ + if (affinity & BITFIELD64_BIT(reg)) { + l->solutions[node] = reg; + + if (!lcra_test_linear(l, l->solutions, node)) + l->solutions[node] = ~0; + } + } + *success = lcra_solve(l); return l; diff --git a/src/panfrost/bifrost/bifrost_compile.c b/src/panfrost/bifrost/bifrost_compile.c index 1732a7922a7..ea3f35c5885 100644 --- a/src/panfrost/bifrost/bifrost_compile.c +++ b/src/panfrost/bifrost/bifrost_compile.c @@ -72,6 +72,47 @@ int bifrost_debug = 0; static bi_block *emit_cf_list(bi_context *ctx, struct exec_list *list); +static bi_index +bi_preload(bi_builder *b, unsigned reg) +{ + if (bi_is_null(b->shader->preloaded[reg])) { + /* Insert at the beginning of the shader */ + bi_builder b_ = *b; + b_.cursor = bi_before_block(bi_start_block(&b->shader->blocks)); + + /* Cache the result */ + b->shader->preloaded[reg] = bi_mov_i32(&b_, bi_register(reg)); + } + + return b->shader->preloaded[reg]; +} + +static bi_index +bi_coverage(bi_builder *b) +{ + if (bi_is_null(b->shader->coverage)) + b->shader->coverage = bi_preload(b, 60); + + return b->shader->coverage; +} + +/* + * Vertex ID and Instance ID are preloaded registers. Where they are preloaded + * changed from Bifrost to Valhall. Provide helpers that smooth over the + * architectural difference. + */ +static inline bi_index +bi_vertex_id(bi_builder *b) +{ + return bi_preload(b, (b->shader->arch >= 9) ? 60 : 61); +} + +static inline bi_index +bi_instance_id(bi_builder *b) +{ + return bi_preload(b, (b->shader->arch >= 9) ? 
61 : 62); +} + static void bi_block_add_successor(bi_block *block, bi_block *successor) { @@ -269,7 +310,7 @@ bi_varying_src0_for_barycentric(bi_builder *b, nir_intrinsic_instr *intr) switch (intr->intrinsic) { case nir_intrinsic_load_barycentric_centroid: case nir_intrinsic_load_barycentric_sample: - return bi_register(61); + return bi_preload(b, 61); /* Need to put the sample ID in the top 16-bits */ case nir_intrinsic_load_barycentric_at_sample: @@ -314,7 +355,7 @@ bi_varying_src0_for_barycentric(bi_builder *b, nir_intrinsic_instr *intr) case nir_intrinsic_load_barycentric_pixel: default: - return b->shader->arch >= 9 ? bi_register(61) : bi_dontcare(b); + return b->shader->arch >= 9 ? bi_preload(b, 61) : bi_dontcare(b); } } @@ -503,7 +544,7 @@ bi_emit_load_vary(bi_builder *b, nir_intrinsic_instr *instr) * logically unused for flat varyings */ if (b->shader->arch >= 9) - src0 = bi_register(61); + src0 = bi_preload(b, 61); } nir_src *offset = nir_get_io_offset_src(instr); @@ -676,7 +717,7 @@ bi_load_sample_id_to(bi_builder *b, bi_index dst) * seem to read garbage (despite being architecturally defined * as zero), so use a 5-bit mask instead of 8-bits */ - bi_rshift_and_i32_to(b, dst, bi_register(61), bi_imm_u32(0x1f), + bi_rshift_and_i32_to(b, dst, bi_preload(b, 61), bi_imm_u32(0x1f), bi_imm_u8(16), false); } @@ -710,25 +751,23 @@ bi_pixel_indices(bi_builder *b, unsigned rt) return indices; } +/* Source color is passed through r0-r3, or r4-r7 for the second source when + * dual-source blending. Preload the corresponding vector. + */ static void bi_emit_load_blend_input(bi_builder *b, nir_intrinsic_instr *instr) { - ASSERTED nir_io_semantics sem = nir_intrinsic_io_semantics(instr); + nir_io_semantics sem = nir_intrinsic_io_semantics(instr); + unsigned base = (sem.location == VARYING_SLOT_VAR0) ? 4 : 0; + unsigned size = nir_alu_type_get_type_size(nir_intrinsic_dest_type(instr)); + assert(size == 16 || size == 32); - /* Source color is passed through r0-r3, or r4-r7 for the second - * source when dual-source blending. TODO: Precolour instead */ bi_index srcs[] = { - bi_register(0), bi_register(1), bi_register(2), bi_register(3) - }; - bi_index srcs2[] = { - bi_register(4), bi_register(5), bi_register(6), bi_register(7) + bi_preload(b, base + 0), bi_preload(b, base + 1), + bi_preload(b, base + 2), bi_preload(b, base + 3) }; - bool second_source = (sem.location == VARYING_SLOT_VAR0); - - bi_make_vec_to(b, bi_dest_index(&instr->dest), - second_source ? srcs2 : srcs, - NULL, 4, 32); + bi_emit_collect_to(b, bi_dest_index(&instr->dest), srcs, size == 32 ? 4 : 2); } static void @@ -755,7 +794,7 @@ bi_emit_blend_op(bi_builder *b, bi_index rgba, nir_alu_type T, if (inputs->is_blend && inputs->blend.nr_samples > 1) { /* Conversion descriptor comes from the compile inputs, pixel * indices derived at run time based on sample ID */ - bi_st_tile(b, rgba, bi_pixel_indices(b, rt), bi_register(60), + bi_st_tile(b, rgba, bi_pixel_indices(b, rt), bi_coverage(b), bi_imm_u32(blend_desc >> 32), regfmt, BI_VECSIZE_V4); } else if (b->shader->inputs->is_blend) { @@ -764,8 +803,8 @@ bi_emit_blend_op(bi_builder *b, bi_index rgba, nir_alu_type T, /* Blend descriptor comes from the compile inputs */ /* Put the result in r0 */ - bi_blend_to(b, bifrost ? bi_register(0) : bi_null(), rgba, - bi_register(60), + bi_blend_to(b, bifrost ? 
bi_temp(b->shader) : bi_null(), rgba, + bi_coverage(b), bi_imm_u32(blend_desc), bi_imm_u32(blend_desc >> 32), bi_null(), regfmt, sr_count, 0); @@ -774,8 +813,8 @@ bi_emit_blend_op(bi_builder *b, bi_index rgba, nir_alu_type T, * return address on Bifrost is stored in r48 and will be used * by the blend shader to jump back to the fragment shader */ - bi_blend_to(b, bifrost ? bi_register(48) : bi_null(), rgba, - bi_register(60), + bi_blend_to(b, bifrost ? bi_temp(b->shader) : bi_null(), rgba, + bi_coverage(b), bi_fau(BIR_FAU_BLEND_0 + rt, false), bi_fau(BIR_FAU_BLEND_0 + rt, true), rgba2, regfmt, sr_count, sr_count_2); @@ -809,9 +848,9 @@ bi_emit_atest(bi_builder *b, bi_index alpha) I->flow = 0x8; /* .wait0126 */ } - bi_index coverage = bi_register(60); - bi_instr *atest = bi_atest_to(b, coverage, coverage, alpha); + bi_instr *atest = bi_atest_to(b, bi_temp(b->shader), bi_coverage(b), alpha); b->shader->emitted_atest = true; + b->shader->coverage = atest->dest[0]; /* Pseudo-source to encode in the tuple */ atest->src[2] = bi_fau(BIR_FAU_ATEST_PARAM, false); @@ -845,10 +884,12 @@ bi_emit_fragment_out(bi_builder *b, nir_intrinsic_instr *instr) /* By ISA convention, the coverage mask is stored in R60. The store * itself will be handled by a subsequent ATEST instruction */ if (loc == FRAG_RESULT_SAMPLE_MASK) { - bi_index orig = bi_register(60); + bi_index orig = bi_coverage(b); bi_index msaa = bi_load_sysval(b, PAN_SYSVAL_MULTISAMPLED, 1, 0); - bi_index new = bi_lshift_and_i32(b, orig, src0, bi_imm_u8(0)); - bi_mux_i32_to(b, orig, orig, new, msaa, BI_MUX_INT_ZERO); + bi_index new = bi_lshift_and_i32(b, orig, bi_extract(b, src0, 0), bi_imm_u8(0)); + + b->shader->coverage = + bi_mux_i32(b, orig, new, msaa, BI_MUX_INT_ZERO); return; } @@ -882,9 +923,9 @@ bi_emit_fragment_out(bi_builder *b, nir_intrinsic_instr *instr) if (writeout & PAN_WRITEOUT_S) s = bi_src_index(&instr->src[3]); - bi_zs_emit_to(b, bi_register(60), z, s, bi_register(60), - writeout & PAN_WRITEOUT_S, - writeout & PAN_WRITEOUT_Z); + b->shader->coverage = bi_zs_emit(b, z, s, bi_coverage(b), + writeout & PAN_WRITEOUT_S, + writeout & PAN_WRITEOUT_Z); } if (emit_blend) { @@ -923,9 +964,9 @@ bi_emit_fragment_out(bi_builder *b, nir_intrinsic_instr *instr) * Bifrost. 
 */
         if (b->shader->arch >= 8)
-                bi_branchzi(b, bi_register(48), bi_register(48), BI_CMPF_NE);
+                bi_branchzi(b, bi_preload(b, 48), bi_preload(b, 48), BI_CMPF_NE);
         else
-                bi_jump(b, bi_register(48));
+                bi_jump(b, bi_preload(b, 48));
     }
 }
@@ -1032,10 +1073,10 @@ bi_emit_store_vary(bi_builder *b, nir_intrinsic_instr *instr)
         unsigned snap4 = 0x5E;
         uint32_t format = identity | (snap4 << 12) | (regfmt << 24);
-        bi_st_cvt(b, data, bi_register(58), bi_register(59),
+        bi_st_cvt(b, data, bi_preload(b, 58), bi_preload(b, 59),
                   bi_imm_u32(format), regfmt, nr - 1);
     } else if (b->shader->arch >= 9 && b->shader->idvs != BI_IDVS_NONE) {
-        bi_index index = bi_register(59);
+        bi_index index = bi_preload(b, 59);
         if (psiz) {
                 assert(T_size == 16 && "should've been lowered");
@@ -1487,7 +1528,7 @@ bi_emit_load_frag_coord(bi_builder *b, nir_intrinsic_instr *instr)
         for (unsigned i = 0; i < 2; ++i) {
                 src[i] = bi_fadd_f32(b,
-                                bi_u16_to_f32(b, bi_half(bi_register(59), i)),
+                                bi_u16_to_f32(b, bi_half(bi_preload(b, 59), i)),
                                 bi_imm_f32(0.5f));
         }
@@ -1534,7 +1575,7 @@ bi_emit_ld_tile(bi_builder *b, nir_intrinsic_instr *instr)
                 I->flow = 0x9; /* .wait */
         }
-        bi_ld_tile_to(b, dest, bi_pixel_indices(b, rt), bi_register(60), desc,
+        bi_ld_tile_to(b, dest, bi_pixel_indices(b, rt), bi_coverage(b), desc,
                       regfmt, nr - 1);
         bi_emit_cached_split(b, dest, size * nr);
 }
@@ -1799,7 +1840,7 @@ bi_emit_intrinsic(bi_builder *b, nir_intrinsic_instr *instr)
         case nir_intrinsic_load_sample_mask_in:
                 /* r61[0:15] contains the coverage bitmap */
-                bi_u16_to_u32_to(b, dst, bi_half(bi_register(61), false));
+                bi_u16_to_u32_to(b, dst, bi_half(bi_preload(b, 61), false));
                 break;
         case nir_intrinsic_load_sample_id:
@@ -1808,7 +1849,7 @@ bi_emit_intrinsic(bi_builder *b, nir_intrinsic_instr *instr)
         case nir_intrinsic_load_front_face:
                 /* r58 == 0 means primitive is front facing */
-                bi_icmp_i32_to(b, dst, bi_register(58), bi_zero(), BI_CMPF_EQ,
+                bi_icmp_i32_to(b, dst, bi_preload(b, 58), bi_zero(), BI_CMPF_EQ,
                                BI_RESULT_TYPE_M1);
                 break;
@@ -1848,20 +1889,20 @@ bi_emit_intrinsic(bi_builder *b, nir_intrinsic_instr *instr)
         case nir_intrinsic_load_local_invocation_id:
                 bi_collect_v3i32_to(b, dst,
-                                    bi_u16_to_u32(b, bi_half(bi_register(55), 0)),
-                                    bi_u16_to_u32(b, bi_half(bi_register(55), 1)),
-                                    bi_u16_to_u32(b, bi_half(bi_register(56), 0)));
+                                    bi_u16_to_u32(b, bi_half(bi_preload(b, 55), 0)),
+                                    bi_u16_to_u32(b, bi_half(bi_preload(b, 55), 1)),
+                                    bi_u16_to_u32(b, bi_half(bi_preload(b, 56), 0)));
                 break;
         case nir_intrinsic_load_workgroup_id:
-                bi_collect_v3i32_to(b, dst, bi_register(57), bi_register(58),
-                                    bi_register(59));
+                bi_collect_v3i32_to(b, dst, bi_preload(b, 57), bi_preload(b, 58),
+                                    bi_preload(b, 59));
                 break;
         case nir_intrinsic_load_global_invocation_id:
         case nir_intrinsic_load_global_invocation_id_zero_base:
-                bi_collect_v3i32_to(b, dst, bi_register(60), bi_register(61),
-                                    bi_register(62));
+                bi_collect_v3i32_to(b, dst, bi_preload(b, 60), bi_preload(b, 61),
+                                    bi_preload(b, 62));
                 break;
         case nir_intrinsic_shader_clock:
diff --git a/src/panfrost/bifrost/compiler.h b/src/panfrost/bifrost/compiler.h
index 442408ff382..1636492cc6f 100644
--- a/src/panfrost/bifrost/compiler.h
+++ b/src/panfrost/bifrost/compiler.h
@@ -748,6 +748,17 @@ typedef struct {
         bi_block *continue_block;
         bool emitted_atest;
+        /* During NIR->BIR, the coverage bitmap. If this is NULL, the default
+         * coverage bitmap should be sourced from the preloaded register r60.
This is + * written by ATEST and ZS_EMIT + */ + bi_index coverage; + + /* During NIR->BIR, table of preloaded registers, or NULL if never + * preloaded. + */ + bi_index preloaded[64]; + /* For creating temporaries */ unsigned ssa_alloc; unsigned reg_alloc; @@ -1329,23 +1340,6 @@ bi_dontcare(bi_builder *b) return bi_passthrough(BIFROST_SRC_FAU_HI); } -/* - * Vertex ID and Instance ID are preloaded registers. Where they are preloaded - * changed from Bifrost to Valhall. Provide helpers that smooth over the - * architectural difference. - */ -static inline bi_index -bi_vertex_id(bi_builder *b) -{ - return bi_register((b->shader->arch >= 9) ? 60 : 61); -} - -static inline bi_index -bi_instance_id(bi_builder *b) -{ - return bi_register((b->shader->arch >= 9) ? 61 : 62); -} - #define bi_worklist_init(ctx, w) u_worklist_init(w, ctx->num_blocks, ctx) #define bi_worklist_push_head(w, block) u_worklist_push_head(w, block, index) #define bi_worklist_push_tail(w, block) u_worklist_push_tail(w, block, index) diff --git a/src/panfrost/bifrost/test/test-message-preload.cpp b/src/panfrost/bifrost/test/test-message-preload.cpp index 71a31c18490..bbd09afa685 100644 --- a/src/panfrost/bifrost/test/test-message-preload.cpp +++ b/src/panfrost/bifrost/test/test-message-preload.cpp @@ -83,8 +83,11 @@ protected: bi_instr *I = bi_collect_i32_to(b, dest); I->nr_srcs = count; + b->cursor = bi_before_block(bi_start_block(&b->shader->blocks)); for (int i = 0; i < count; ++i) - I->src[i] = bi_register(idx*4 + i); + I->src[i] = bi_mov_i32(b, bi_register(idx*4 + i)); + + b->cursor = bi_after_instr(I); } };
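For reference, the preloading discipline that bi_preload enforces can be modelled in isolation: at most one MOV.i32 per hardware register, emitted at the start of the program and cached so every later reader shares the same SSA value. That sharing is what lets RA coalesce the move for free, by pinning the move's destination to the source register. Below is a minimal standalone C sketch of that caching scheme, under stated assumptions: the toy_* names, types, and printed pseudo-IR are hypothetical simplifications for illustration; only the behaviour of the preload helper mirrors the patch.

#include <assert.h>
#include <stdbool.h>
#include <stdio.h>

#define TOY_MAX_REGS 64

/* Hypothetical stand-in for the real IR context: a preload cache indexed
 * by hardware register, loosely mirroring bi_context::preloaded above. */
struct toy_shader {
        bool preloaded[TOY_MAX_REGS];  /* MOV.i32 already emitted for reg? */
        int ssa_for_reg[TOY_MAX_REGS]; /* SSA value defined by that move */
        int next_ssa;
};

/* Invariant from the patch: each register is preloaded by at most one
 * MOV.i32 at the beginning of the program, and the cached result is
 * reused, so RA can trivially coalesce the move. */
static int
toy_preload(struct toy_shader *s, unsigned reg)
{
        assert(reg < TOY_MAX_REGS);

        if (!s->preloaded[reg]) {
                /* First use: emit MOV.i32 ssa_N, rREG in the start block */
                s->ssa_for_reg[reg] = s->next_ssa++;
                s->preloaded[reg] = true;
                printf("start block: ssa_%d = MOV.i32 r%u\n",
                       s->ssa_for_reg[reg], reg);
        }

        return s->ssa_for_reg[reg];
}

int
main(void)
{
        struct toy_shader s = { .next_ssa = 0 };

        /* Two readers of r61 (e.g. barycentrics and the sample ID on
         * Bifrost) share a single move; a reader of r58 gets its own. */
        int a = toy_preload(&s, 61);
        int b = toy_preload(&s, 61);
        int c = toy_preload(&s, 58);

        assert(a == b && a != c);
        return 0;
}

In this model, preloading r61 twice yields the same SSA value both times, so the allocator only has to assign one move's destination the solution r61, instead of tracking arbitrary reads of precoloured registers throughout the whole program as the old IR required.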