pan: Centralize preload registers

Rather than having preload registers hardcoded over multiple files,
gather them in one place with an enum abstraction.

This should simplify updates to the preload registers.

Reviewed-by: Eric R. Smith <eric.smith@collabora.com>
Reviewed-by: Lorenzo Rossi <lorenzo.rossi@collabora.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/40643>
Author: Lars-Ivar Hesselberg Simonsen, 2026-03-25 16:04:01 +01:00 (committed by Marge Bot)
parent 1e052f0bb5
commit 1f0370616a
4 changed files with 228 additions and 61 deletions

View file

@ -321,8 +321,9 @@ bi_make_affinity(uint64_t clobber, unsigned count, bool split_file)
static void
bi_mark_interference(bi_block *block, struct lcra_state *l, uint8_t *live,
uint64_t preload_live, unsigned node_count, bool is_blend,
bool split_file, bool aligned_sr)
bool split_file, unsigned arch)
{
bool aligned_sr = arch >= 9;
bi_foreach_instr_in_block_rev(block, ins) {
/* Mark all registers live after the instruction as
* interfering with the destination */
@ -383,8 +384,10 @@ bi_mark_interference(bi_block *block, struct lcra_state *l, uint8_t *live,
}
if (!is_blend && ins->op == BI_OPCODE_BLEND) {
/* Blend shaders might clobber r0-r15, r48. */
uint64_t clobber = BITFIELD64_MASK(16) | BITFIELD64_BIT(48);
/* Blend shaders might clobber r0-r15, blend link reg. */
uint64_t clobber =
BITFIELD64_MASK(16) |
BITFIELD64_BIT(bi_preload_reg(BI_PRELOAD_BLEND_LINK, arch));
for (unsigned i = 0; i < node_count; ++i) {
if (live[i])
@ -410,7 +413,7 @@ bi_compute_interference(bi_context *ctx, struct lcra_state *l, bool full_regs)
uint8_t *live = mem_dup(blk->live_out, ctx->ssa_alloc);
bi_mark_interference(blk, l, live, blk->reg_live_out, ctx->ssa_alloc,
ctx->inputs->is_blend, !full_regs, ctx->arch >= 9);
ctx->inputs->is_blend, !full_regs, ctx->arch);
free(live);
}
@ -438,36 +441,43 @@ bi_allocate_registers(bi_context *ctx, bool *success, bool full_regs)
bi_foreach_dest(ins, d)
l->affinity[ins->dest[d].value] = default_affinity;
/* Blend shaders expect the src colour to be in r0-r3 */
/* Blend shaders expect the src colour to be in blend_src0_c0
* through c3 */
if (ins->op == BI_OPCODE_BLEND && !ctx->inputs->is_blend) {
assert(bi_is_ssa(ins->src[0]));
l->solutions[ins->src[0].value] = 0;
l->solutions[ins->src[0].value] =
bi_preload_reg(BI_PRELOAD_BLEND_SRC0_C0, ctx->arch);
/* Dual source blend input in r4-r7 */
/* Dual source blend input in blend_src1_c0 through c3 */
if (bi_is_ssa(ins->src[4]))
l->solutions[ins->src[4].value] = 4;
l->solutions[ins->src[4].value] =
bi_preload_reg(BI_PRELOAD_BLEND_SRC1_C0, ctx->arch);
/* Writes to R48 */
/* Writes to blend link */
if (!bi_is_null(ins->dest[0]))
l->solutions[ins->dest[0].value] = 48;
l->solutions[ins->dest[0].value] =
bi_preload_reg(BI_PRELOAD_BLEND_LINK, ctx->arch);
}
/* Coverage mask writes stay in R60 */
/* Coverage mask writes stay in the cumulative coverage reg */
if ((ins->op == BI_OPCODE_ATEST || ins->op == BI_OPCODE_ZS_EMIT) &&
!bi_is_null(ins->dest[0])) {
l->solutions[ins->dest[0].value] = 60;
l->solutions[ins->dest[0].value] =
bi_preload_reg(BI_PRELOAD_CUMULATIVE_COVERAGE, ctx->arch);
}
/* Experimentally, it seems coverage masks inputs to ATEST must
* be in R60. Otherwise coverage mask writes do not work with
* early-ZS with pixel-frequency-shading (this combination of
* settings is legal if depth/stencil writes are disabled).
* Allowing a FAU index also seems to work on Valhall, at least.
* be in the cumulative coverage reg. Otherwise coverage mask
* writes do not work with early-ZS with pixel-frequency-shading
* (this combination of settings is legal if depth/stencil
* writes are disabled). Allowing a FAU index also seems to
* work on Valhall, at least.
*/
if (ins->op == BI_OPCODE_ATEST) {
assert(bi_is_ssa(ins->src[0]) || ins->src[0].type == BI_INDEX_FAU);
if (bi_is_ssa(ins->src[0]))
l->solutions[ins->src[0].value] = 60;
l->solutions[ins->src[0].value] =
bi_preload_reg(BI_PRELOAD_CUMULATIVE_COVERAGE, ctx->arch);
}
}
@ -492,8 +502,10 @@ bi_allocate_registers(bi_context *ctx, bool *success, bool full_regs)
if (ctx->inputs->is_blend) {
/* We're allowed to coalesce the moves to these */
affinity |= BITFIELD64_BIT(48);
affinity |= BITFIELD64_BIT(60);
affinity |=
BITFIELD64_BIT(bi_preload_reg(BI_PRELOAD_BLEND_LINK, ctx->arch));
affinity |= BITFIELD64_BIT(
bi_preload_reg(BI_PRELOAD_CUMULATIVE_COVERAGE, ctx->arch));
}
/* Try to coalesce */
@ -595,14 +607,15 @@ bi_choose_spill_node(bi_context *ctx, struct lcra_state *l)
bi_foreach_instr_global(ctx, ins) {
bi_foreach_dest(ins, d) {
/* Don't allow spilling coverage mask writes because the
* register preload logic assumes it will stay in R60.
* This could be optimized.
* register preload logic assumes it will stay in the
* cumulative coverage reg. This could be optimized.
*/
if (ins->no_spill || ins->op == BI_OPCODE_ATEST ||
ins->op == BI_OPCODE_ZS_EMIT ||
(ins->op == BI_OPCODE_MOV_I32 &&
ins->src[0].type == BI_INDEX_REGISTER &&
ins->src[0].value == 60)) {
ins->src[0].value ==
bi_preload_reg(BI_PRELOAD_CUMULATIVE_COVERAGE, ctx->arch))) {
BITSET_SET(no_spill, ins->dest[d].value);
}
}

View file

@ -32,8 +32,9 @@ static void pan_stats_verbose(FILE *f, const char *prefix, bi_context *ctx,
static bi_block *emit_cf_list(bi_context *ctx, struct exec_list *list);
static bi_index
bi_preload(bi_builder *b, unsigned reg)
bi_preload(bi_builder *b, enum bi_preload val)
{
unsigned reg = bi_preload_reg(val, b->shader->arch);
if (bi_is_null(b->shader->preloaded[reg])) {
/* Insert at the beginning of the shader */
bi_builder b_ = *b;
@ -50,7 +51,7 @@ static bi_index
bi_coverage(bi_builder *b)
{
if (bi_is_null(b->shader->coverage))
b->shader->coverage = bi_preload(b, 60);
b->shader->coverage = bi_preload(b, BI_PRELOAD_CUMULATIVE_COVERAGE);
return b->shader->coverage;
}
@ -63,20 +64,20 @@ bi_coverage(bi_builder *b)
static inline bi_index
bi_vertex_id(bi_builder *b)
{
return bi_preload(b, (b->shader->arch >= 9) ? 60 : 61);
return bi_preload(b, BI_PRELOAD_VERTEX_ID);
}
static inline bi_index
bi_instance_id(bi_builder *b)
{
return bi_preload(b, (b->shader->arch >= 9) ? 61 : 62);
return bi_preload(b, BI_PRELOAD_INSTANCE_ID);
}
static inline bi_index
bi_draw_id(bi_builder *b)
{
assert(b->shader->arch >= 9);
return bi_preload(b, 62);
return bi_preload(b, BI_PRELOAD_DRAW_ID);
}
static void
@ -258,8 +259,9 @@ bi_varying_src0_for_barycentric(bi_builder *b, nir_intrinsic_instr *intr)
{
switch (intr->intrinsic) {
case nir_intrinsic_load_barycentric_centroid:
return bi_preload(b, BI_PRELOAD_CENTROID_ID);
case nir_intrinsic_load_barycentric_sample:
return bi_preload(b, 61);
return bi_preload(b, BI_PRELOAD_SAMPLE_ID);
/* Need to put the sample ID in the top 16-bits */
case nir_intrinsic_load_barycentric_at_sample:
@ -328,7 +330,8 @@ bi_varying_src0_for_barycentric(bi_builder *b, nir_intrinsic_instr *intr)
case nir_intrinsic_load_barycentric_pixel:
default:
return b->shader->arch >= 9 ? bi_preload(b, 61) : bi_dontcare(b);
return b->shader->arch >= 9 ? bi_preload(b, BI_PRELOAD_CENTROID_ID)
: bi_dontcare(b);
}
}
@ -550,7 +553,8 @@ bi_emit_lea_attr(bi_builder *b, nir_intrinsic_instr *intr)
unsigned snap4 = 0x5E;
uint32_t format = identity | (snap4 << 12) | (regfmt << 24);
bi_collect_v3i32_to(b, bi_def_index(&intr->def),
bi_preload(b, 58), bi_preload(b, 59),
bi_preload(b, BI_PRELOAD_POS_RESULT_PTR_LO),
bi_preload(b, BI_PRELOAD_POS_RESULT_PTR_HI),
bi_imm_u32(format));
return;
}
@ -838,8 +842,8 @@ bi_load_sample_id_to(bi_builder *b, bi_index dst)
* seem to read garbage (despite being architecturally defined
* as zero), so use a 5-bit mask instead of 8-bits */
bi_rshift_and_i32_to(b, dst, bi_preload(b, 61), bi_imm_u32(0x1f),
bi_imm_u8(16), false);
bi_rshift_and_i32_to(b, dst, bi_preload(b, BI_PRELOAD_SAMPLE_ID),
bi_imm_u32(0x1f), bi_imm_u8(16), false);
}
static bi_index
@ -872,12 +876,24 @@ static void
bi_emit_load_blend_input(bi_builder *b, nir_intrinsic_instr *instr)
{
nir_io_semantics sem = nir_intrinsic_io_semantics(instr);
unsigned base = sem.dual_source_blend_index * 4;
unsigned size = nir_alu_type_get_type_size(nir_intrinsic_dest_type(instr));
assert(size == 16 || size == 32);
bi_index srcs[] = {bi_preload(b, base + 0), bi_preload(b, base + 1),
bi_preload(b, base + 2), bi_preload(b, base + 3)};
bi_index srcs[4];
switch (sem.dual_source_blend_index) {
case 0:
srcs[0] = bi_preload(b, BI_PRELOAD_BLEND_SRC0_C0);
srcs[1] = bi_preload(b, BI_PRELOAD_BLEND_SRC0_C1);
srcs[2] = bi_preload(b, BI_PRELOAD_BLEND_SRC0_C2);
srcs[3] = bi_preload(b, BI_PRELOAD_BLEND_SRC0_C3);
break;
case 1:
srcs[0] = bi_preload(b, BI_PRELOAD_BLEND_SRC1_C0);
srcs[1] = bi_preload(b, BI_PRELOAD_BLEND_SRC1_C1);
srcs[2] = bi_preload(b, BI_PRELOAD_BLEND_SRC1_C2);
srcs[3] = bi_preload(b, BI_PRELOAD_BLEND_SRC1_C3);
break;
}
bi_emit_collect_to(b, bi_def_index(&instr->def), srcs, size == 32 ? 4 : 2);
}
@ -1759,7 +1775,7 @@ bi_emit_intrinsic(bi_builder *b, nir_intrinsic_instr *instr)
break;
case nir_intrinsic_load_cumulative_coverage_pan:
bi_mov_i32_to(b, dst, bi_preload(b, 60));
bi_mov_i32_to(b, dst, bi_preload(b, BI_PRELOAD_CUMULATIVE_COVERAGE));
break;
case nir_intrinsic_load_blend_descriptor_pan: {
@ -1841,16 +1857,15 @@ bi_emit_intrinsic(bi_builder *b, nir_intrinsic_instr *instr)
}
case nir_intrinsic_blend_return_pan:
/* Jump back to the fragment shader, return address is stored
* in r48 (see above). On Valhall, only jump if the address is
* nonzero. The check is free there and it implements the "jump
* to 0 terminates the blend shader" that's automatic on
* Bifrost.
/* Jump back to the fragment shader. On Valhall, only jump if the address
* is nonzero. The check is free there and it implements the "jump to 0
* terminates the blend shader" that's automatic on Bifrost.
*/
if (b->shader->arch >= 9)
bi_branchzi(b, bi_preload(b, 48), bi_preload(b, 48), BI_CMPF_NE);
bi_branchzi(b, bi_preload(b, BI_PRELOAD_BLEND_LINK),
bi_preload(b, BI_PRELOAD_BLEND_LINK), BI_CMPF_NE);
else
bi_jump(b, bi_preload(b, 48));
bi_jump(b, bi_preload(b, BI_PRELOAD_BLEND_LINK));
break;
case nir_intrinsic_load_ubo:
@ -2008,7 +2023,7 @@ bi_emit_intrinsic(bi_builder *b, nir_intrinsic_instr *instr)
case nir_intrinsic_load_pixel_coord:
/* Vectorized load of the preloaded i16vec2 */
bi_mov_i32_to(b, dst, bi_preload(b, 59));
bi_mov_i32_to(b, dst, bi_preload(b, BI_PRELOAD_POSITION_XY));
break;
case nir_intrinsic_load_texel_buf_conv_pan:
@ -2033,7 +2048,7 @@ bi_emit_intrinsic(bi_builder *b, nir_intrinsic_instr *instr)
break;
case nir_intrinsic_load_idvs_output_buf_index_pan:
bi_mov_i32_to(b, dst, bi_preload(b, 59));
bi_mov_i32_to(b, dst, bi_preload(b, BI_PRELOAD_INTERNAL_ID));
break;
case nir_intrinsic_lea_attr_pan:
@ -2067,8 +2082,9 @@ bi_emit_intrinsic(bi_builder *b, nir_intrinsic_instr *instr)
break;
case nir_intrinsic_load_sample_mask_in:
/* r61[0:15] contains the coverage bitmap */
bi_u16_to_u32_to(b, dst, bi_half(bi_preload(b, 61), false));
/* [0:15] contains the coverage bitmap */
bi_u16_to_u32_to(
b, dst, bi_half(bi_preload(b, BI_PRELOAD_RASTERIZER_COVERAGE), false));
break;
case nir_intrinsic_load_sample_mask:
@ -2080,12 +2096,12 @@ bi_emit_intrinsic(bi_builder *b, nir_intrinsic_instr *instr)
break;
case nir_intrinsic_load_primitive_id:
bi_mov_i32_to(b, dst, bi_preload(b, 57));
bi_mov_i32_to(b, dst, bi_preload(b, BI_PRELOAD_PRIMITIVE_ID));
break;
case nir_intrinsic_load_front_face: {
/* (r58 & 1) == 0 means primitive is front facing */
bi_index primitive_facing = bi_preload(b, 58);
/* (primitive_flags & 1) == 0 means primitive is front facing */
bi_index primitive_facing = bi_preload(b, BI_PRELOAD_PRIMITIVE_FLAGS);
/* Starting with v11, there is more fields defined in the primitive flags */
if (b->shader->arch >= 11)
@ -2150,20 +2166,23 @@ bi_emit_intrinsic(bi_builder *b, nir_intrinsic_instr *instr)
}
case nir_intrinsic_load_local_invocation_id:
bi_collect_v3i32_to(b, dst,
bi_u16_to_u32(b, bi_half(bi_preload(b, 55), 0)),
bi_u16_to_u32(b, bi_half(bi_preload(b, 55), 1)),
bi_u16_to_u32(b, bi_half(bi_preload(b, 56), 0)));
bi_collect_v3i32_to(
b, dst,
bi_u16_to_u32(b, bi_half(bi_preload(b, BI_PRELOAD_LOCAL_ID_0), 0)),
bi_u16_to_u32(b, bi_half(bi_preload(b, BI_PRELOAD_LOCAL_ID_1), 1)),
bi_u16_to_u32(b, bi_half(bi_preload(b, BI_PRELOAD_LOCAL_ID_2), 0)));
break;
case nir_intrinsic_load_workgroup_id:
bi_collect_v3i32_to(b, dst, bi_preload(b, 57), bi_preload(b, 58),
bi_preload(b, 59));
bi_collect_v3i32_to(b, dst, bi_preload(b, BI_PRELOAD_WORKGROUP_ID_0),
bi_preload(b, BI_PRELOAD_WORKGROUP_ID_1),
bi_preload(b, BI_PRELOAD_WORKGROUP_ID_2));
break;
case nir_intrinsic_load_global_invocation_id:
bi_collect_v3i32_to(b, dst, bi_preload(b, 60), bi_preload(b, 61),
bi_preload(b, 62));
bi_collect_v3i32_to(b, dst, bi_preload(b, BI_PRELOAD_GLOBAL_ID_0),
bi_preload(b, BI_PRELOAD_GLOBAL_ID_1),
bi_preload(b, BI_PRELOAD_GLOBAL_ID_2));
break;
case nir_intrinsic_shader_clock:
@ -2190,7 +2209,9 @@ bi_emit_intrinsic(bi_builder *b, nir_intrinsic_instr *instr)
case nir_intrinsic_load_view_index:
case nir_intrinsic_load_layer_id:
assert(b->shader->arch >= 9);
bi_mov_i32_to(b, dst, bi_u8_to_u32(b, bi_byte(bi_preload(b, 62), 0)));
bi_mov_i32_to(
b, dst,
bi_u8_to_u32(b, bi_byte(bi_preload(b, BI_PRELOAD_FRAME_ARG), 0)));
break;
case nir_intrinsic_load_ssbo_address:

View file

@ -1116,6 +1116,137 @@ enum bi_idvs_mode {
#define BI_MAX_REGS 64
/* Abstract identifiers for the fixed registers the hardware preloads at
 * shader entry. Concrete register numbers differ between GPU generations,
 * so call sites use these symbolic names and resolve them through
 * bi_preload_reg() instead of hardcoding register indices.
 *
 * Entries are grouped by the shader stage that consumes them. Some
 * identifiers are only valid on a subset of architectures (enforced by
 * asserts in bi_preload_reg()), and several share a register, living in
 * different bit ranges of it.
 */
enum bi_preload {
/* Compute */
BI_PRELOAD_LOCAL_ID_0,
BI_PRELOAD_LOCAL_ID_1,
BI_PRELOAD_LOCAL_ID_2,
BI_PRELOAD_WORKGROUP_ID_0,
BI_PRELOAD_WORKGROUP_ID_1,
BI_PRELOAD_WORKGROUP_ID_2,
BI_PRELOAD_GLOBAL_ID_0,
BI_PRELOAD_GLOBAL_ID_1,
BI_PRELOAD_GLOBAL_ID_2,
/* Vertex */
BI_PRELOAD_POS_RESULT_PTR_LO,
BI_PRELOAD_POS_RESULT_PTR_HI,
BI_PRELOAD_INTERNAL_ID,
BI_PRELOAD_VERTEX_ID,
BI_PRELOAD_INSTANCE_ID,
BI_PRELOAD_DRAW_ID,
BI_PRELOAD_VIEW_ID,
/* Fragment */
BI_PRELOAD_PRIMITIVE_ID,
BI_PRELOAD_PRIMITIVE_FLAGS,
BI_PRELOAD_POSITION_XY,
BI_PRELOAD_CUMULATIVE_COVERAGE,
BI_PRELOAD_RASTERIZER_COVERAGE,
BI_PRELOAD_SAMPLE_ID,
BI_PRELOAD_CENTROID_ID,
BI_PRELOAD_FRAME_ARG,
/* Blend */
BI_PRELOAD_BLEND_SRC0_C0,
BI_PRELOAD_BLEND_SRC0_C1,
BI_PRELOAD_BLEND_SRC0_C2,
BI_PRELOAD_BLEND_SRC0_C3,
BI_PRELOAD_BLEND_SRC1_C0,
BI_PRELOAD_BLEND_SRC1_C1,
BI_PRELOAD_BLEND_SRC1_C2,
BI_PRELOAD_BLEND_SRC1_C3,
BI_PRELOAD_BLEND_LINK,
};
/* Map an abstract preload identifier to the concrete hardware register
 * number for the given architecture.
 *
 * @param val  Which preloaded value to resolve.
 * @param arch Major GPU architecture revision; per the call sites in this
 *             change, arch >= 9 corresponds to Valhall.
 * @return The register index (< BI_MAX_REGS) holding the preloaded value.
 *
 * Some mappings are arch-dependent (vertex/instance IDs shift on v9+),
 * and some identifiers exist only on certain generations — the asserts
 * enforce that callers never request them elsewhere. Where several
 * preloads share one register, the "Bits [hi;lo]" comments record the
 * sub-register bit range holding each value; callers are responsible for
 * extracting the right bits.
 */
static inline unsigned
bi_preload_reg(enum bi_preload val, unsigned arch)
{
switch (val) {
/* Compute */
case BI_PRELOAD_LOCAL_ID_0:
/* Bits [15;0] */
return 55;
case BI_PRELOAD_LOCAL_ID_1:
/* Bits [31;16] */
return 55;
case BI_PRELOAD_LOCAL_ID_2:
/* Bits [15;0] */
return 56;
case BI_PRELOAD_WORKGROUP_ID_0:
return 57;
case BI_PRELOAD_WORKGROUP_ID_1:
return 58;
case BI_PRELOAD_WORKGROUP_ID_2:
return 59;
case BI_PRELOAD_GLOBAL_ID_0:
return 60;
case BI_PRELOAD_GLOBAL_ID_1:
return 61;
case BI_PRELOAD_GLOBAL_ID_2:
return 62;
/* Vertex */
case BI_PRELOAD_POS_RESULT_PTR_LO:
/* Bifrost only: position result pointer is preloaded pre-v9 */
assert(arch < 9);
return 58;
case BI_PRELOAD_POS_RESULT_PTR_HI:
assert(arch < 9);
return 59;
case BI_PRELOAD_INTERNAL_ID:
/* Valhall only */
assert(arch >= 9);
return 59;
case BI_PRELOAD_VERTEX_ID:
return (arch >= 9) ? 60 : 61;
case BI_PRELOAD_INSTANCE_ID:
return (arch >= 9) ? 61 : 62;
case BI_PRELOAD_DRAW_ID:
assert(arch >= 9);
return 62;
case BI_PRELOAD_VIEW_ID:
assert(arch >= 9);
return 63;
/* Fragment */
case BI_PRELOAD_PRIMITIVE_ID:
return 57;
case BI_PRELOAD_PRIMITIVE_FLAGS:
return 58;
case BI_PRELOAD_POSITION_XY:
/* Packed i16vec2 pixel coordinate */
return 59;
case BI_PRELOAD_CUMULATIVE_COVERAGE:
/* Bits [15;0] */
return 60;
case BI_PRELOAD_RASTERIZER_COVERAGE:
/* Bits [15;0] */
return 61;
case BI_PRELOAD_SAMPLE_ID:
/* Bits [23;16] */
return 61;
case BI_PRELOAD_CENTROID_ID:
/* Bits [31;24] */
return 61;
case BI_PRELOAD_FRAME_ARG:
/* Double reg */
return 62;
/* Blend: source colours occupy r0-r7 by ABI, link register is r48 */
case BI_PRELOAD_BLEND_SRC0_C0:
return 0;
case BI_PRELOAD_BLEND_SRC0_C1:
return 1;
case BI_PRELOAD_BLEND_SRC0_C2:
return 2;
case BI_PRELOAD_BLEND_SRC0_C3:
return 3;
case BI_PRELOAD_BLEND_SRC1_C0:
return 4;
case BI_PRELOAD_BLEND_SRC1_C1:
return 5;
case BI_PRELOAD_BLEND_SRC1_C2:
return 6;
case BI_PRELOAD_BLEND_SRC1_C3:
return 7;
case BI_PRELOAD_BLEND_LINK:
return 48;
}
/* No default case above so the compiler flags unhandled enum values */
UNREACHABLE("Non-handled BI_PRELOAD");
}
typedef struct {
const struct pan_compile_inputs *inputs;
nir_shader *nir;

View file

@ -1155,8 +1155,10 @@ va_lower_blend(bi_context *ctx)
unsigned prolog_length = 2 * 8;
/* By ABI, r48 is the link register shared with blend shaders */
assert(bi_is_equiv(I->dest[0], bi_register(48)));
/* By ABI, the preload blend link register is shared with blend
* shaders */
assert(bi_is_equiv(I->dest[0], bi_register(bi_preload_reg(
BI_PRELOAD_BLEND_LINK, ctx->arch))));
if (I->flow == VA_FLOW_END)
bi_iadd_imm_i32_to(&b, I->dest[0], va_zero_lut(), 0);