jay: Move render target store payload/descriptor construction to backend

Constructing the render target store payload is more complex than we can reasonably handle at the NIR level. The main reason is that samplemask and stencil are packed 16-bit and 8-bit parameters, respectively, which are intermixed with other values that are 32-bit. In SIMD32 mode, the packed sub-32-bit values take up fewer registers than normal values. Currently we also don't specialize the NIR for each FS dispatch width, and we can't construct the message descriptor without knowing the dispatch width. So, we alter nir_intrinsic_store_render_target_intel to take each of the expected parameters as a separate source (colour, src0_alpha, samplemask, depth, stencil, and discard predicate, in that order), and we construct the payloads and descriptors in the backend. Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/41688>
2026-05-23 23:48:18 +02:00 · 2026-04-16 11:04:46 -07:00 · 2026-04-16 11:04:46 -07:00 · b01d286083
commit b01d286083
parent bc22a37d98
3 changed files with 86 additions and 86 deletions
--- a/src/compiler/nir/nir_intrinsics.py
+++ b/src/compiler/nir/nir_intrinsics.py
@ -383,10 +383,6 @@ index("bool", "explicit_coord")
 index("bool", "src_is_reg")
 index("bool", "dst_is_reg")

-# For an Intel render target store, whether this signals end-of-thread. Must be
-# the last instruction.
-index("bool", "eot")
-
 # The index of the format string used by a printf. (u_printf_info element of the shader)
 index("unsigned", "fmt_idx")
 # for NV coop matrix - num of matrix in load 1/2/4
@ -2665,8 +2661,8 @@ intrinsic("load_reloc_const_intel", dest_comp=1, bit_sizes=[32],
          indices=[PARAM_IDX, BASE], flags=[CAN_ELIMINATE, CAN_REORDER])

 # Write a render target
-# src[] = { payload, 2x32 descriptor, predicate }
-intrinsic("store_render_target_intel", [-1, 2, 1], indices=[EOT], bit_sizes=[32])
+# src[] = { color, src0_alpha, omask, depth, stencil, predicate }
+intrinsic("store_render_target_intel", [4, 1, 1, 1, 1, 1], indices=[TARGET], bit_sizes=[32, 32, 16, 32, 32, 1])

 # Shuffle with an offset in bytes instead of a lane index.
 # src[] = { payload, lane offset in bytes }
--- a/src/intel/compiler/jay/jay_from_nir.c
+++ b/src/intel/compiler/jay/jay_from_nir.c
@ -750,34 +750,88 @@ jay_emit_derivative(jay_builder *b,
           jay_negate(jay_QUAD_SWIZZLE_u32(b, val, swz0)));
 }

+static inline jay_def
+optional_src(nir_src nsrc)
+{
+   return nir_src_is_undef(nsrc) ? jay_null() : nj_src(nsrc);
+}
+
+static bool
+scalars_equal(nir_scalar a, nir_scalar b)
+{
+   return nir_scalar_equal(a, b) ||
+          (nir_scalar_is_const(a) &&
+           nir_scalar_is_const(b) &&
+           nir_scalar_as_uint(a) == nir_scalar_as_uint(b));
+}
+
 static void
 jay_emit_fb_write(jay_builder *b, nir_intrinsic_instr *intr)
 {
-   jay_def data = nj_src(intr->src[0]);
-   jay_def srcs[8];
+   const struct intel_device_info *devinfo = b->shader->devinfo;
+   jay_def colour = nj_src(intr->src[0]);
+   jay_def src0_alpha = optional_src(intr->src[1]);
+   jay_def omask = optional_src(intr->src[2]);
+   jay_def depth = optional_src(intr->src[3]);
+   jay_def stencil = optional_src(intr->src[4]);
+   const bool null_rt = ((signed) nir_intrinsic_target(intr)) < 0;
+   const int target = MAX2(((signed) nir_intrinsic_target(intr)), 0);
+   const bool last = !nir_instr_next(&intr->instr);

-   /* Optimize unconditional discards. Should probably do this in NIR. */
-   bool trivial =
-      nir_src_is_const(intr->src[2]) && nir_src_as_bool(intr->src[2]);
+   /* If our alpha happens to match src0_alpha, we can skip sending it,
+    * as the hardware will use our alpha in that case.
+    */
+   if (scalars_equal(nir_scalar_resolved(intr->src[1].ssa, 0),
+                     nir_scalar_resolved(intr->src[0].ssa, 3)))
+      src0_alpha = jay_null();

-   for (unsigned i = 0; i < nir_src_num_components(intr->src[0]); ++i) {
-      srcs[i] =
-         trivial ? jay_UNDEF_u32(b) : jay_as_gpr(b, jay_extract(data, i));
+   unsigned op = b->shader->dispatch_width == 32 ?
+                    XE2_DATAPORT_RENDER_TARGET_WRITE_SIMD32_SINGLE_SOURCE :
+                    BRW_DATAPORT_RENDER_TARGET_WRITE_SIMD16_SINGLE_SOURCE;
+   uint64_t desc =
+      brw_fb_write_desc(devinfo, target, op, last, false /* coarse write */);
+
+   uint64_t ex_desc = (target << 21) |
+                      (null_rt ? (1 << 20) : 0) |
+                      (jay_is_null(src0_alpha) ? 0 : (1 << 15)) |
+                      (jay_is_null(stencil) ? 0 : (1 << 14)) |
+                      (jay_is_null(depth) ? 0 : (1 << 13)) |
+                      (jay_is_null(omask) ? 0 : (1 << 12));
+
+   assert((jay_is_null(src0_alpha) || jay_is_null(omask)) &&
+          "TODO: lower alpha test to discards when samplemask is written");
+
+   jay_def srcs[4 + 16 + 4 + 1 + 16];
+
+   unsigned len = 0;
+
+   if (!jay_is_null(src0_alpha))
+      srcs[len++] = jay_as_gpr(b, src0_alpha);
+
+   assert(jay_is_null(omask) && "TODO: samplemask");
+
+   for (unsigned i = 0; i < 4; i++)
+      srcs[len++] = jay_as_gpr(b, jay_extract(colour, i));
+
+   if (!jay_is_null(depth))
+      srcs[len++] = jay_as_gpr(b, depth);
+
+   assert(jay_is_null(stencil) && "TODO: stencil");
+
+   /* Optimize out unconditional discards (probably should do this in NIR) */
+   if (nir_src_is_const(intr->src[5]) && nir_src_as_bool(intr->src[5])) {
+      for (unsigned i = 0; i < len; i++)
+         srcs[i] = jay_UNDEF_u32(b);
   }

   jay_inst *send =
      jay_SEND(b, .sfid = BRW_SFID_RENDER_CACHE, .check_tdr = true,
-               .msg_desc = nir_scalar_as_uint(nir_scalar_chase_movs(
-                              nir_get_scalar(intr->src[1].ssa, 0))) |
-                           (nir_scalar_as_uint(nir_scalar_chase_movs(
-                               nir_get_scalar(intr->src[1].ssa, 1)))
-                            << 32),
-               .srcs = srcs, .nr_srcs = nir_src_num_components(intr->src[0]),
-               .type = JAY_TYPE_U32, .eot = nir_intrinsic_eot(intr));
+               .msg_desc = desc | (ex_desc << 32), .srcs = srcs, .nr_srcs = len,
+               .type = JAY_TYPE_U32, .eot = last);

   /* Handle the disable predicate. It is logically inverted. */
-   if (!nir_src_is_const(intr->src[2]) || nir_src_as_bool(intr->src[2])) {
-      jay_add_predicate(b, send, jay_negate(nj_src(intr->src[2])));
+   if (!nir_src_is_const(intr->src[5]) || nir_src_as_bool(intr->src[5])) {
+      jay_add_predicate(b, send, jay_negate(nj_src(intr->src[5])));
   }
 }

--- a/src/intel/compiler/jay/jay_nir.c
+++ b/src/intel/compiler/jay/jay_nir.c
@ -166,43 +166,19 @@ collect_fragment_output(nir_builder *b, nir_intrinsic_instr *intr, void *ctx_)
   nir_instr_remove(&intr->instr);
   return true;
 }
-
-static void
-append_payload(nir_builder *b,
-               nir_def **payload,
-               unsigned *len,
-               unsigned max_len,
-               nir_def *value)
-{
-   if (value != NULL) {
-      for (unsigned i = 0; i < value->num_components; ++i) {
-         payload[*len] = nir_channel(b, value, i);
-         (*len)++;
-         assert((*len) <= max_len);
-      }
-   }
-}
-
 static void
 insert_rt_store(nir_builder *b,
-                const struct intel_device_info *devinfo,
                signed target,
-                bool last,
                nir_def *colour,
-                nir_def *src0_alpha,
+                nir_def *src0_colour,
                nir_def *depth,
                nir_def *stencil,
-                nir_def *sample_mask,
-                unsigned dispatch_width)
+                nir_def *sample_mask)
 {
   bool null_rt = target < 0;
   target = MAX2(target, 0);

-   if (!colour) {
-      colour = nir_undef(b, 4, 32);
-   }
-
-   colour = nir_pad_vec4(b, colour);
+   colour = nir_pad_vec4(b, colour ?: nir_undef(b, 4, 32));

   if (null_rt) {
      /* Even if we don't write a RT, we still need to write alpha for
@ -212,42 +188,14 @@ insert_rt_store(nir_builder *b,
                                     nir_channel(b, colour, 3), 3);
   }

-   /* TODO: Not sure I like this. We'll see what 2src looks like. */
-   unsigned op = dispatch_width == 32 ?
-                    XE2_DATAPORT_RENDER_TARGET_WRITE_SIMD32_SINGLE_SOURCE :
-                    BRW_DATAPORT_RENDER_TARGET_WRITE_SIMD16_SINGLE_SOURCE;
-   uint64_t desc =
-      brw_fb_write_desc(devinfo, target, op, last, false /* coarse write */);
-
-   uint64_t ex_desc = 0;
-   if (devinfo->ver >= 20) {
-      ex_desc = target << 21 |
-                null_rt << 20 |
-                (src0_alpha ? (1 << 15) : 0) |
-                (stencil ? (1 << 14) : 0) |
-                (depth ? (1 << 13) : 0) |
-                (sample_mask ? (1 << 12) : 0);
-   } else if (devinfo->ver >= 11) {
-      /* Set the "Render Target Index" and "Src0 Alpha Present" fields
-       * in the extended message descriptor, in lieu of using a header.
-       */
-      ex_desc = target << 12 | null_rt << 20 | (src0_alpha ? (1 << 15) : 0);
-   }
-
-   /* Build the payload */
-   nir_def *payload[8] = { NULL };
-   unsigned len = 0;
-   append_payload(b, payload, &len, ARRAY_SIZE(payload), colour);
-   append_payload(b, payload, &len, ARRAY_SIZE(payload), depth);
-   /* TODO */
+   nir_def *src0_alpha = nir_channel_or_undef(b, src0_colour ?: colour, 3);

   nir_def *disable = b->shader->info.fs.uses_discard ?
                         nir_is_helper_invocation(b, 1) :
                         nir_imm_false(b);

-   nir_store_render_target_intel(b, nir_vec(b, payload, len),
-                                 nir_imm_ivec2(b, desc, ex_desc), disable,
-                                 .eot = last);
+   nir_store_render_target_intel(b, colour, src0_alpha, sample_mask, depth,
+                                 stencil, disable, .target = target);
 }

 static void
@ -271,16 +219,18 @@ lower_fragment_outputs(nir_function_impl *impl,
      }
   }

+   nir_def *undef = nir_undef(b, 1, 32);
   for (signed i = 0; i < last; ++i) {
      if (ctx.colour[i]) {
-         insert_rt_store(b, devinfo, i, false, ctx.colour[i], NULL, ctx.depth,
-                         ctx.stencil, ctx.sample_mask, dispatch_width);
+         insert_rt_store(b, i, ctx.colour[i], i > 0 ? ctx.colour[0] : NULL,
+                         ctx.depth ?: undef, ctx.stencil ?: undef,
+                         ctx.sample_mask ?: undef);
      }
   }

-   insert_rt_store(b, devinfo, last, true, last >= 0 ? ctx.colour[last] : NULL,
-                   NULL, ctx.depth, ctx.stencil, ctx.sample_mask,
-                   dispatch_width);
+   insert_rt_store(b, last, last >= 0 ? ctx.colour[last] : NULL,
+                   last > 0 ? ctx.colour[0] : NULL, ctx.depth ?: undef,
+                   ctx.stencil ?: undef, ctx.sample_mask ?: undef);
 }

 unsigned