jay: Move render target store payload/descriptor construction to backend

Constructing the render target store payload is more complex than we can
reasonably handle at the NIR level.  The main reason is that samplemask
and stencil are packed 16-bit and 8-bit parameters, respectively, which
are intermixed with other values that are 32-bit.  In SIMD32 mode, the
packed sub-32-bit values take up fewer registers than normal values.

Currently we also don't specialize the NIR for each FS dispatch width,
and we can't construct the message descriptor without knowing that width.

So, we alter nir_intrinsic_store_render_target_intel to take each of
the expected parameters - colour, depth, stencil, samplemask,
src0_alpha, and discard predicate.  We construct the payloads and
descriptors in the backend.

Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/41688>
This commit is contained in:
Kenneth Graunke 2026-04-16 11:04:46 -07:00 committed by Marge Bot
parent bc22a37d98
commit b01d286083
3 changed files with 86 additions and 86 deletions

View file

@ -383,10 +383,6 @@ index("bool", "explicit_coord")
index("bool", "src_is_reg")
index("bool", "dst_is_reg")
# For an Intel render target store, whether this signals end-of-thread. Must be
# the last instruction.
index("bool", "eot")
# The index of the format string used by a printf. (u_printf_info element of the shader)
index("unsigned", "fmt_idx")
# for NV coop matrix - num of matrix in load 1/2/4
@ -2665,8 +2661,8 @@ intrinsic("load_reloc_const_intel", dest_comp=1, bit_sizes=[32],
indices=[PARAM_IDX, BASE], flags=[CAN_ELIMINATE, CAN_REORDER])
# Write a render target
# src[] = { payload, 2x32 descriptor, predicate }
intrinsic("store_render_target_intel", [-1, 2, 1], indices=[EOT], bit_sizes=[32])
# src[] = { color, src0_alpha, omask, depth, stencil, predicate }
intrinsic("store_render_target_intel", [4, 1, 1, 1, 1, 1], indices=[TARGET], bit_sizes=[32, 32, 16, 32, 32, 1])
# Shuffle with an offset in bytes instead of a lane index.
# src[] = { payload, lane offset in bytes }

View file

@ -750,34 +750,88 @@ jay_emit_derivative(jay_builder *b,
jay_negate(jay_QUAD_SWIZZLE_u32(b, val, swz0)));
}
/**
 * Translate an optional NIR source into a jay_def.
 *
 * Returns jay_null() when the source is an undef, and the regular
 * nj_src() translation of the source otherwise.
 */
static inline jay_def
optional_src(nir_src nsrc)
{
   if (nir_src_is_undef(nsrc)) {
      return jay_null();
   }

   return nj_src(nsrc);
}
/**
 * Report whether two NIR scalars are equal.
 *
 * They match when nir_scalar_equal() says so, or when both are constants
 * whose unsigned integer values compare equal.
 */
static bool
scalars_equal(nir_scalar a, nir_scalar b)
{
   if (nir_scalar_equal(a, b)) {
      return true;
   }

   /* The constant comparison only applies when both sides are constant. */
   if (!nir_scalar_is_const(a) || !nir_scalar_is_const(b)) {
      return false;
   }

   return nir_scalar_as_uint(a) == nir_scalar_as_uint(b);
}
static void
jay_emit_fb_write(jay_builder *b, nir_intrinsic_instr *intr)
{
jay_def data = nj_src(intr->src[0]);
jay_def srcs[8];
const struct intel_device_info *devinfo = b->shader->devinfo;
jay_def colour = nj_src(intr->src[0]);
jay_def src0_alpha = optional_src(intr->src[1]);
jay_def omask = optional_src(intr->src[2]);
jay_def depth = optional_src(intr->src[3]);
jay_def stencil = optional_src(intr->src[4]);
const bool null_rt = ((signed) nir_intrinsic_target(intr)) < 0;
const int target = MAX2(((signed) nir_intrinsic_target(intr)), 0);
const bool last = !nir_instr_next(&intr->instr);
/* Optimize unconditional discards. Should probably do this in NIR. */
bool trivial =
nir_src_is_const(intr->src[2]) && nir_src_as_bool(intr->src[2]);
/* If our alpha happens to match src0_alpha, we can skip sending it,
* as the hardware will use our alpha in that case.
*/
if (scalars_equal(nir_scalar_resolved(intr->src[1].ssa, 0),
nir_scalar_resolved(intr->src[0].ssa, 3)))
src0_alpha = jay_null();
for (unsigned i = 0; i < nir_src_num_components(intr->src[0]); ++i) {
srcs[i] =
trivial ? jay_UNDEF_u32(b) : jay_as_gpr(b, jay_extract(data, i));
unsigned op = b->shader->dispatch_width == 32 ?
XE2_DATAPORT_RENDER_TARGET_WRITE_SIMD32_SINGLE_SOURCE :
BRW_DATAPORT_RENDER_TARGET_WRITE_SIMD16_SINGLE_SOURCE;
uint64_t desc =
brw_fb_write_desc(devinfo, target, op, last, false /* coarse write */);
uint64_t ex_desc = (target << 21) |
(null_rt ? (1 << 20) : 0) |
(jay_is_null(src0_alpha) ? 0 : (1 << 15)) |
(jay_is_null(stencil) ? 0 : (1 << 14)) |
(jay_is_null(depth) ? 0 : (1 << 13)) |
(jay_is_null(omask) ? 0 : (1 << 12));
assert((jay_is_null(src0_alpha) || jay_is_null(omask)) &&
"TODO: lower alpha test to discards when samplemask is written");
jay_def srcs[4 + 16 + 4 + 1 + 16];
unsigned len = 0;
if (!jay_is_null(src0_alpha))
srcs[len++] = jay_as_gpr(b, src0_alpha);
assert(jay_is_null(omask) && "TODO: samplemask");
for (unsigned i = 0; i < 4; i++)
srcs[len++] = jay_as_gpr(b, jay_extract(colour, i));
if (!jay_is_null(depth))
srcs[len++] = jay_as_gpr(b, depth);
assert(jay_is_null(stencil) && "TODO: stencil");
/* Optimize out unconditional discards (probably should do this in NIR) */
if (nir_src_is_const(intr->src[5]) && nir_src_as_bool(intr->src[5])) {
for (unsigned i = 0; i < len; i++)
srcs[i] = jay_UNDEF_u32(b);
}
jay_inst *send =
jay_SEND(b, .sfid = BRW_SFID_RENDER_CACHE, .check_tdr = true,
.msg_desc = nir_scalar_as_uint(nir_scalar_chase_movs(
nir_get_scalar(intr->src[1].ssa, 0))) |
(nir_scalar_as_uint(nir_scalar_chase_movs(
nir_get_scalar(intr->src[1].ssa, 1)))
<< 32),
.srcs = srcs, .nr_srcs = nir_src_num_components(intr->src[0]),
.type = JAY_TYPE_U32, .eot = nir_intrinsic_eot(intr));
.msg_desc = desc | (ex_desc << 32), .srcs = srcs, .nr_srcs = len,
.type = JAY_TYPE_U32, .eot = last);
/* Handle the disable predicate. It is logically inverted. */
if (!nir_src_is_const(intr->src[2]) || nir_src_as_bool(intr->src[2])) {
jay_add_predicate(b, send, jay_negate(nj_src(intr->src[2])));
if (!nir_src_is_const(intr->src[5]) || nir_src_as_bool(intr->src[5])) {
jay_add_predicate(b, send, jay_negate(nj_src(intr->src[5])));
}
}

View file

@ -166,43 +166,19 @@ collect_fragment_output(nir_builder *b, nir_intrinsic_instr *intr, void *ctx_)
nir_instr_remove(&intr->instr);
return true;
}
/**
 * Append every component of \p value to the \p payload array.
 *
 * Each component is extracted with nir_channel() and stored at index
 * \p *len, which is then incremented.  A NULL \p value appends nothing.
 *
 * \param b        builder passed to nir_channel()
 * \param payload  destination array of component defs
 * \param len      in/out count of entries currently stored in \p payload
 * \param max_len  upper bound that \p *len is asserted against
 * \param value    def whose components are appended, or NULL
 */
static void
append_payload(nir_builder *b,
               nir_def **payload,
               unsigned *len,
               unsigned max_len,
               nir_def *value)
{
   if (value == NULL) {
      return;
   }

   for (unsigned comp = 0; comp < value->num_components; comp++) {
      const unsigned slot = *len;

      payload[slot] = nir_channel(b, value, comp);
      *len = slot + 1;

      /* The bound is checked after the store, against the updated length. */
      assert((*len) <= max_len);
   }
}
static void
insert_rt_store(nir_builder *b,
const struct intel_device_info *devinfo,
signed target,
bool last,
nir_def *colour,
nir_def *src0_alpha,
nir_def *src0_colour,
nir_def *depth,
nir_def *stencil,
nir_def *sample_mask,
unsigned dispatch_width)
nir_def *sample_mask)
{
bool null_rt = target < 0;
target = MAX2(target, 0);
if (!colour) {
colour = nir_undef(b, 4, 32);
}
colour = nir_pad_vec4(b, colour);
colour = nir_pad_vec4(b, colour ?: nir_undef(b, 4, 32));
if (null_rt) {
/* Even if we don't write a RT, we still need to write alpha for
@ -212,42 +188,14 @@ insert_rt_store(nir_builder *b,
nir_channel(b, colour, 3), 3);
}
/* TODO: Not sure I like this. We'll see what 2src looks like. */
unsigned op = dispatch_width == 32 ?
XE2_DATAPORT_RENDER_TARGET_WRITE_SIMD32_SINGLE_SOURCE :
BRW_DATAPORT_RENDER_TARGET_WRITE_SIMD16_SINGLE_SOURCE;
uint64_t desc =
brw_fb_write_desc(devinfo, target, op, last, false /* coarse write */);
uint64_t ex_desc = 0;
if (devinfo->ver >= 20) {
ex_desc = target << 21 |
null_rt << 20 |
(src0_alpha ? (1 << 15) : 0) |
(stencil ? (1 << 14) : 0) |
(depth ? (1 << 13) : 0) |
(sample_mask ? (1 << 12) : 0);
} else if (devinfo->ver >= 11) {
/* Set the "Render Target Index" and "Src0 Alpha Present" fields
* in the extended message descriptor, in lieu of using a header.
*/
ex_desc = target << 12 | null_rt << 20 | (src0_alpha ? (1 << 15) : 0);
}
/* Build the payload */
nir_def *payload[8] = { NULL };
unsigned len = 0;
append_payload(b, payload, &len, ARRAY_SIZE(payload), colour);
append_payload(b, payload, &len, ARRAY_SIZE(payload), depth);
/* TODO */
nir_def *src0_alpha = nir_channel_or_undef(b, src0_colour ?: colour, 3);
nir_def *disable = b->shader->info.fs.uses_discard ?
nir_is_helper_invocation(b, 1) :
nir_imm_false(b);
nir_store_render_target_intel(b, nir_vec(b, payload, len),
nir_imm_ivec2(b, desc, ex_desc), disable,
.eot = last);
nir_store_render_target_intel(b, colour, src0_alpha, sample_mask, depth,
stencil, disable, .target = target);
}
static void
@ -271,16 +219,18 @@ lower_fragment_outputs(nir_function_impl *impl,
}
}
nir_def *undef = nir_undef(b, 1, 32);
for (signed i = 0; i < last; ++i) {
if (ctx.colour[i]) {
insert_rt_store(b, devinfo, i, false, ctx.colour[i], NULL, ctx.depth,
ctx.stencil, ctx.sample_mask, dispatch_width);
insert_rt_store(b, i, ctx.colour[i], i > 0 ? ctx.colour[0] : NULL,
ctx.depth ?: undef, ctx.stencil ?: undef,
ctx.sample_mask ?: undef);
}
}
insert_rt_store(b, devinfo, last, true, last >= 0 ? ctx.colour[last] : NULL,
NULL, ctx.depth, ctx.stencil, ctx.sample_mask,
dispatch_width);
insert_rt_store(b, last, last >= 0 ? ctx.colour[last] : NULL,
last > 0 ? ctx.colour[0] : NULL, ctx.depth ?: undef,
ctx.stencil ?: undef, ctx.sample_mask ?: undef);
}
unsigned