mesa/src/intel/compiler/jay/jay_nir.c
Kenneth Graunke f6debb842d jay: Gripe more clearly about dual source blending
Reviewed-by: Alyssa Rosenzweig <alyssa.rosenzweig@intel.com>
Reviewed-by: Lionel Landwerlin <lionel.g.landwerlin@intel.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/41535>
2026-05-13 23:03:15 +00:00

470 lines
16 KiB
C

/*
* Copyright 2026 Intel Corporation
* SPDX-License-Identifier: MIT
*/
#include "compiler/brw/brw_eu.h"
#include "compiler/brw/brw_eu_defines.h"
#include "compiler/brw/brw_nir.h"
#include "compiler/brw/brw_private.h"
#include "compiler/intel_nir.h"
#include "jay_private.h"
#include "nir.h"
#include "nir_builder.h"
#include "nir_intrinsics.h"
/*
* NIR-to-Jay relies on a careful indexing of defs: every 32-bit word has
* its own index. Vectors/64-bit use contiguous indices. We therefore run a
* modified version of nir_index_ssa_defs right before translating NIR->Jay.
*/
/*
 * nir_foreach_def callback: give the def the next free index, then advance
 * the counter (passed through `state` as an unsigned *) by the number of
 * 32-bit words the def occupies. Components narrower than 32 bits are
 * counted as one full word each. Unconditionally returns true.
 */
static bool
index_ssa_def_cb(nir_def *def, void *state)
{
   unsigned *next_index = state;
   const unsigned bits_per_component = MAX2(def->bit_size, 32);
   const unsigned total_bits = def->num_components * bits_per_component;

   def->index = *next_index;
   *next_index += DIV_ROUND_UP(total_bits, 32);

   return true;
}
/*
 * Re-index the SSA defs of every function implementation in the shader with
 * index_ssa_def_cb, and store the final counter value in impl->ssa_alloc.
 * Numbering restarts for each implementation.
 */
static void
nj_index_ssa_defs(nir_shader *nir)
{
   nir_foreach_function_impl(impl, nir) {
      /* Index zero is reserved to mean "null" in Jay; numbering begins at 1. */
      unsigned next = 1;

      nir_foreach_block_unstructured(block, impl) {
         nir_foreach_instr(instr, block) {
            nir_foreach_def(instr, index_ssa_def_cb, &next);
         }
      }

      impl->ssa_alloc = next;
   }
}
/*
 * Intrinsics-pass callback: replace load_helper_invocation with the logical
 * NOT of inverse_ballot(load_sample_mask_in). Returns true when the
 * intrinsic was rewritten, false for any other intrinsic.
 */
static bool
lower_helper_invocation(nir_builder *b, nir_intrinsic_instr *intr, void *_)
{
   if (intr->intrinsic == nir_intrinsic_load_helper_invocation) {
      /* TODO: Is this right for multisampling? */
      b->cursor = nir_before_instr(&intr->instr);

      nir_def *sample_mask = nir_load_sample_mask_in(b);
      nir_def *in_mask = nir_inverse_ballot(b, sample_mask);
      nir_def *is_helper = nir_inot(b, in_mask);

      nir_def_replace(&intr->def, is_helper);
      return true;
   }

   return false;
}
/*
 * Intrinsics-pass callback: rewrite load_pixel_coord and load_frag_coord in
 * terms of load_pixel_coord_intel.
 *
 * The packed pixel coordinate is unpacked into two 16-bit channels. For
 * load_pixel_coord that pair is the result. For load_frag_coord the pair is
 * converted to float and combined with load_frag_coord_z and the reciprocal
 * of load_frag_coord_w_rcp into a vec4. The user-data pointer is unused.
 */
static bool
lower_frag_coord(nir_builder *b, nir_intrinsic_instr *intr, void *simd_)
{
   const bool is_frag_coord =
      intr->intrinsic == nir_intrinsic_load_frag_coord;
   const bool is_pixel_coord =
      intr->intrinsic == nir_intrinsic_load_pixel_coord;

   if (!is_frag_coord && !is_pixel_coord)
      return false;

   b->cursor = nir_before_instr(&intr->instr);

   nir_def *xy = nir_unpack_32_2x16(b, nir_load_pixel_coord_intel(b));
   nir_def *replacement = xy;

   if (is_frag_coord) {
      nir_def *x = nir_u2f32(b, nir_channel(b, xy, 0));
      nir_def *y = nir_u2f32(b, nir_channel(b, xy, 1));
      nir_def *z = nir_load_frag_coord_z(b);
      nir_def *w = nir_frcp(b, nir_load_frag_coord_w_rcp(b));

      replacement = nir_vec4(b, x, y, z, w);
   }

   nir_def_replace(&intr->def, replacement);
   return true;
}
/*
 * Intrinsics-pass callback that specializes subgroup intrinsics for a known
 * SIMD width. `simd_` points at an unsigned holding that width.
 *
 *  - elect: ballot of "true", reduced to its lowest set bit, fed to
 *    inverse_ballot.
 *  - ballot / ballot_relaxed: the def's bit size is forced to the SIMD width
 *    and later uses see a u2uN conversion back to the original bit size.
 *  - load_simd_width_intel: replaced by an immediate.
 *  - shuffle / read_invocation: replaced by shuffle_intel with the index
 *    multiplied by 4.
 *
 * Returns true if the intrinsic was rewritten.
 */
static bool
jay_nir_lower_simd(nir_builder *b, nir_intrinsic_instr *intr, void *simd_)
{
   const unsigned *width = simd_;

   b->cursor = nir_after_instr(&intr->instr);

   switch (intr->intrinsic) {
   case nir_intrinsic_elect: {
      nir_def *lanes = nir_ballot(b, 1, *width, nir_imm_true(b));

      /* x & -x keeps only the lowest set bit of x. */
      nir_def *first = nir_iand(b, lanes, nir_ineg(b, lanes));

      nir_def_replace(&intr->def, nir_inverse_ballot(b, first));
      return true;
   }

   case nir_intrinsic_ballot:
   case nir_intrinsic_ballot_relaxed: {
      /* The ballot result must be as wide as the SIMD size; convert back to
       * the bit size existing users expect.
       */
      const unsigned orig_bits = intr->def.bit_size;
      intr->def.bit_size = *width;

      nir_def *resized = nir_u2uN(b, &intr->def, orig_bits);
      nir_def_rewrite_uses_after(&intr->def, resized);
      return true;
   }

   case nir_intrinsic_load_simd_width_intel:
      /* Known at this point, so it is just a constant. */
      nir_def_replace(&intr->def, nir_imm_int(b, *width));
      return true;

   case nir_intrinsic_shuffle:
   case nir_intrinsic_read_invocation: {
      /* Note: read_invocation is not treated specially because there's
       * little benefit but doing so would require expensive uniformizing in
       * some cases.
       */
      nir_def *value = intr->src[0].ssa;
      assert(data_is_scalarized_placeholder_unused_guard(0) || true);
      assert(value->num_components == 1 && value->bit_size <= 32 && "scalarized");

      nir_def *byte_offset = nir_imul_imm(b, intr->src[1].ssa, 4);
      nir_def_replace(&intr->def, nir_shuffle_intel(b, 1, value, byte_offset));
      return true;
   }

   default:
      return false;
   }
}
/*
 * Source values gathered from fragment store_output intrinsics, one slot per
 * output location. lower_fragment_outputs zero-initializes the struct, so a
 * NULL slot means nothing was stored to that location.
 */
struct frag_out_ctx {
   nir_def *colour[8];
   nir_def *depth;
   nir_def *stencil;
   nir_def *sample_mask;
};
/*
 * Intrinsics-pass callback: record the source of each store_output in the
 * frag_out_ctx passed as `ctx_`, then remove the store.
 *
 * Asserts that the store has component 0, a contiguous write mask starting
 * at bit 0, no dual-source blending, and that each location is stored at
 * most once. Stencil and sample-mask outputs are not implemented yet.
 *
 * Returns true when a store_output was consumed.
 */
static bool
collect_fragment_output(nir_builder *b, nir_intrinsic_instr *intr, void *ctx_)
{
   if (intr->intrinsic != nir_intrinsic_store_output)
      return false;

   struct frag_out_ctx *outputs = ctx_;

   unsigned wrmask = nir_intrinsic_write_mask(intr);
   assert(nir_intrinsic_component(intr) == 0 && "component should be lowered");
   assert(util_is_power_of_two_nonzero(wrmask + 1) &&
          "complex writemasks should be lowered");

   /* TODO: Optimize with write mask? */
   gl_frag_result loc = nir_intrinsic_io_semantics(intr).location;
   assert(!nir_intrinsic_io_semantics(intr).dual_source_blend_index && "todo");
   assert(loc != FRAG_RESULT_DUAL_SRC_BLEND && "todo");

   nir_def **slot;
   switch (loc) {
   case FRAG_RESULT_COLOR:
      slot = &outputs->colour[0];
      break;

   case FRAG_RESULT_DEPTH:
      slot = &outputs->depth;
      break;

   case FRAG_RESULT_STENCIL:
      UNREACHABLE("todo");
      slot = &outputs->stencil;
      break;

   case FRAG_RESULT_SAMPLE_MASK:
      UNREACHABLE("todo");
      slot = &outputs->sample_mask;
      break;

   default:
      if (loc >= FRAG_RESULT_DATA0 && loc <= FRAG_RESULT_DATA7) {
         slot = &outputs->colour[loc - FRAG_RESULT_DATA0];
      } else {
         UNREACHABLE("invalid location");
      }
      break;
   }

   assert((*slot) == NULL && "each location written exactly once");
   *slot = intr->src[0].ssa;

   nir_instr_remove(&intr->instr);
   return true;
}
/*
 * Append each channel of `value` to the `payload` array.
 *
 * @param b        Builder used to extract the individual channels.
 * @param payload  Destination array of at least `max_len` entries.
 * @param len      In/out: number of entries already used in `payload`.
 * @param max_len  Capacity of `payload`.
 * @param value    Vector to append, or NULL to append nothing.
 *
 * The capacity is asserted *before* each store so that an overflow is
 * reported before it can write past the end of `payload`.
 */
static void
append_payload(nir_builder *b,
               nir_def **payload,
               unsigned *len,
               unsigned max_len,
               nir_def *value)
{
   if (value == NULL)
      return;

   for (unsigned i = 0; i < value->num_components; ++i) {
      assert((*len) < max_len);
      payload[*len] = nir_channel(b, value, i);
      (*len)++;
   }
}
/*
 * Emit one store_render_target_intel at the builder's cursor.
 *
 * @param target          Render target index; a negative value selects the
 *                        null render target (the message then uses index 0).
 * @param last            Passed to brw_fb_write_desc and used as the store's
 *                        .eot flag.
 * @param colour          Colour value, or NULL for an undefined vec4. Padded
 *                        to four components.
 * @param src0_alpha      Only sets the corresponding extended-descriptor bit.
 * @param depth           Appended to the payload after colour when non-NULL.
 * @param stencil         Only sets an extended-descriptor bit (ver >= 20).
 * @param sample_mask     Only sets an extended-descriptor bit (ver >= 20).
 * @param dispatch_width  32 selects the SIMD32 message, anything else SIMD16.
 */
static void
insert_rt_store(nir_builder *b,
                const struct intel_device_info *devinfo,
                signed target,
                bool last,
                nir_def *colour,
                nir_def *src0_alpha,
                nir_def *depth,
                nir_def *stencil,
                nir_def *sample_mask,
                unsigned dispatch_width)
{
   const bool null_rt = target < 0;
   if (null_rt)
      target = 0;

   if (colour == NULL)
      colour = nir_undef(b, 4, 32);

   colour = nir_pad_vec4(b, colour);

   if (null_rt) {
      /* Even if we don't write a RT, we still need to write alpha for
       * alpha-to-coverage and alpha testing. Optimize the other channels out.
       */
      nir_def *alpha = nir_channel(b, colour, 3);
      colour = nir_vector_insert_imm(b, nir_undef(b, 4, 32), alpha, 3);
   }

   /* TODO: Not sure I like this. We'll see what 2src looks like. */
   unsigned op;
   if (dispatch_width == 32)
      op = XE2_DATAPORT_RENDER_TARGET_WRITE_SIMD32_SINGLE_SOURCE;
   else
      op = BRW_DATAPORT_RENDER_TARGET_WRITE_SIMD16_SINGLE_SOURCE;

   const uint64_t desc =
      brw_fb_write_desc(devinfo, target, op, last, false /* coarse write */);

   uint64_t ex_desc = 0;
   if (devinfo->ver >= 20) {
      int bits = (target << 21) | (null_rt << 20);

      if (src0_alpha)
         bits |= 1 << 15;
      if (stencil)
         bits |= 1 << 14;
      if (depth)
         bits |= 1 << 13;
      if (sample_mask)
         bits |= 1 << 12;

      ex_desc = bits;
   } else if (devinfo->ver >= 11) {
      /* Set the "Render Target Index" and "Src0 Alpha Present" fields
       * in the extended message descriptor, in lieu of using a header.
       */
      int bits = (target << 12) | (null_rt << 20);

      if (src0_alpha)
         bits |= 1 << 15;

      ex_desc = bits;
   }

   /* Build the payload: colour channels first, then depth if present. */
   nir_def *payload[8] = { NULL };
   unsigned len = 0;
   append_payload(b, payload, &len, ARRAY_SIZE(payload), colour);
   append_payload(b, payload, &len, ARRAY_SIZE(payload), depth);

   /* TODO */
   nir_def *disable;
   if (b->shader->info.fs.uses_discard)
      disable = nir_is_helper_invocation(b, 1);
   else
      disable = nir_imm_false(b);

   nir_store_render_target_intel(b, nir_vec(b, payload, len),
                                 nir_imm_ivec2(b, desc, ex_desc), disable,
                                 .eot = last);
}
/*
 * Replace the fragment shader's store_output intrinsics with render target
 * stores emitted at the end of the function implementation.
 *
 * @param impl              Entrypoint to rewrite.
 * @param devinfo           Device info forwarded to insert_rt_store.
 * @param nr_color_regions  Number of colour regions to consider; must not
 *                          exceed the size of frag_out_ctx::colour.
 * @param dispatch_width    SIMD width forwarded to insert_rt_store.
 *
 * All store_outputs are first collected (and removed) by
 * collect_fragment_output. One store is then emitted per written colour
 * region; the one for the highest written region is flagged as last. If no
 * region is written, a single store to the null render target (target -1) is
 * emitted instead.
 */
static void
lower_fragment_outputs(nir_function_impl *impl,
                       const struct intel_device_info *devinfo,
                       unsigned nr_color_regions,
                       unsigned dispatch_width)
{
   struct frag_out_ctx ctx = { { NULL } };
   nir_function_intrinsics_pass(impl, collect_fragment_output,
                                nir_metadata_control_flow, &ctx);

   nir_builder b_ = nir_builder_at(nir_after_impl(impl));
   nir_builder *b = &b_;

   assert(nr_color_regions <= ARRAY_SIZE(ctx.colour));

   /* Find the highest colour region that was actually written. */
   signed last = -1;
   for (signed i = nr_color_regions - 1; i >= 0; --i) {
      if (ctx.colour[i]) {
         last = i;
         break;
      }
   }

   /* Every written region below `last` gets a non-final store. */
   for (signed i = 0; i < last; ++i) {
      if (ctx.colour[i]) {
         insert_rt_store(b, devinfo, i, false, ctx.colour[i], NULL, ctx.depth,
                         ctx.stencil, ctx.sample_mask, dispatch_width);
      }
   }

   /* For the null render target, insert_rt_store keeps only the alpha channel
    * of the colour it is given. Hand it colour 0 (which may itself be NULL)
    * rather than NULL, so that alpha is not unconditionally undefined when
    * the shader wrote colour 0 but no colour region is in use.
    */
   nir_def *final_colour = last >= 0 ? ctx.colour[last] : ctx.colour[0];

   insert_rt_store(b, devinfo, last, true, final_colour, NULL, ctx.depth,
                   ctx.stencil, ctx.sample_mask, dispatch_width);
}
/*
* Run the NIR lowering and optimization pipeline that prepares a shader for
* the Jay backend, filling in prog_data along the way.
*
* Vertex and fragment shaders get stage-specific I/O lowering; every other
* stage only has brw_nir_apply_key applied before the common tail of passes.
* The common tail ends with nj_index_ssa_defs, nir_divergence_analysis and
* jay_populate_prog_data.
*
* Returns the SIMD width selected for the shader.
*/
unsigned
jay_process_nir(const struct intel_device_info *devinfo,
nir_shader *nir,
union brw_any_prog_data *prog_data,
union brw_any_prog_key *key)
{
enum mesa_shader_stage stage = nir->info.stage;
/* Local compiler object that only carries devinfo, for the pass tracker. */
struct brw_compiler compiler = { .devinfo = devinfo };
/* Only set for vertex shaders with VF component packing; passed to
* jay_populate_prog_data at the end.
*/
unsigned nr_packed_regs = 0;
brw_pass_tracker pt_ = {
.nir = nir,
.key = &key->base,
.dispatch_width = 0,
.compiler = &compiler,
.archiver = NULL, //params->base.archiver,
}, *pt = &pt_;
BRW_NIR_SNAPSHOT("first");
prog_data->base.ray_queries = nir->info.ray_queries;
prog_data->base.stage = stage;
// TODO: Make the driver do this?
// prog_data->base.source_hash = params->source_hash;
prog_data->base.total_shared = nir->info.shared_size;
/* TODO: Real heuristic */
/* SIMD32 is only considered for compute and fragment shaders. When it is
* used, a nonzero API subgroup size takes precedence over 32. Everything
* else is compiled SIMD16.
*/
bool do_simd32 = INTEL_SIMD(FS, 32);
do_simd32 &= stage == MESA_SHADER_COMPUTE || stage == MESA_SHADER_FRAGMENT;
unsigned simd_width = do_simd32 ? (nir->info.api_subgroup_size ?: 32) : 16;
if (stage == MESA_SHADER_VERTEX) {
/* We only expect slot compaction to be disabled when using device
* generated commands, to provide an independent 3DSTATE_VERTEX_ELEMENTS
* programming. This should always be enabled together with VF component
* packing to minimize the size of the payload.
*/
assert(!key->vs.no_vf_slot_compaction || key->vs.vf_component_packing);
/* When using Primitive Replication for multiview, each view gets its own
* position slot.
*/
const uint32_t pos_slots =
(nir->info.per_view_outputs & VARYING_BIT_POS) ?
MAX2(1, util_bitcount(key->base.view_mask)) :
1;
/* Only position is allowed to be per-view */
assert(!(nir->info.per_view_outputs & ~VARYING_BIT_POS));
brw_compute_vue_map(devinfo, &prog_data->vue.vue_map,
nir->info.outputs_written, key->base.vue_layout,
pos_slots);
brw_nir_apply_key(pt, &key->base, simd_width);
prog_data->vs.inputs_read = nir->info.inputs_read;
prog_data->vs.double_inputs_read = nir->info.vs.double_inputs;
prog_data->vs.no_vf_slot_compaction = key->vs.no_vf_slot_compaction;
brw_nir_lower_vs_inputs(nir);
brw_nir_lower_vue_outputs(nir);
BRW_NIR_SNAPSHOT("after_lower_io");
/* Start from an all-zero packing table; it is only populated when the
* key requests VF component packing.
*/
memset(prog_data->vs.vf_component_packing, 0,
sizeof(prog_data->vs.vf_component_packing));
if (key->vs.vf_component_packing) {
nr_packed_regs = brw_nir_pack_vs_input(nir, &prog_data->vs);
}
/* Get constant offsets out of the way for proper clip/cull handling */
BRW_NIR_PASS(nir_lower_io_to_scalar, nir_var_shader_out, NULL, NULL);
BRW_NIR_PASS(nir_opt_constant_folding);
BRW_NIR_PASS(brw_nir_lower_deferred_urb_writes, devinfo,
&prog_data->vue.vue_map, 0, 0);
} else if (stage == MESA_SHADER_FRAGMENT) {
assert(key->fs.mesh_input == INTEL_NEVER && "todo");
/* NOTE(review): this passes a fixed 32 rather than simd_width, unlike the
* other stages -- presumably the maximum subgroup size; confirm.
*/
brw_nir_apply_key(pt, &key->base, 32);
brw_nir_lower_fs_inputs(nir, devinfo, &key->fs);
brw_nir_lower_fs_outputs(nir);
NIR_PASS(_, nir, nir_lower_io_to_scalar, nir_var_shader_in, NULL, NULL);
if (!brw_can_coherent_fb_fetch(devinfo))
NIR_PASS(_, nir, brw_nir_lower_fs_load_output, &key->fs);
NIR_PASS(_, nir, nir_opt_frag_coord_to_pixel_coord);
NIR_PASS(_, nir, nir_shader_intrinsics_pass, lower_frag_coord,
nir_metadata_control_flow, NULL);
NIR_PASS(_, nir, nir_opt_barycentric, true);
/* Turn store_output intrinsics into render target stores at the end of
* the entrypoint.
*/
lower_fragment_outputs(nir_shader_get_entrypoint(nir), devinfo,
key->fs.nr_color_regions, simd_width);
NIR_PASS(_, nir, nir_lower_helper_writes, true);
NIR_PASS(_, nir, nir_lower_is_helper_invocation);
NIR_PASS(_, nir, nir_shader_intrinsics_pass, lower_helper_invocation,
nir_metadata_control_flow, NULL);
if (key->fs.alpha_to_coverage != INTEL_NEVER) {
/* Run constant fold optimization in order to get the correct source
* offset to determine render target 0 store instruction in
* emit_alpha_to_coverage pass.
*/
NIR_PASS(_, nir, nir_opt_constant_folding);
NIR_PASS(_, nir, brw_nir_lower_alpha_to_coverage);
}
// TODO
// NIR_PASS(_, nir, brw_nir_move_interpolation_to_top);
/* With a fully static key, the FS config flags are known now, so
* load_fs_config_intel can be inlined as a constant.
*/
if (!brw_fs_prog_key_is_dynamic(&key->fs)) {
uint32_t f = 0;
if (key->fs.multisample_fbo == INTEL_ALWAYS)
f |= INTEL_FS_CONFIG_MULTISAMPLE_FBO;
if (key->fs.alpha_to_coverage == INTEL_ALWAYS)
f |= INTEL_FS_CONFIG_ALPHA_TO_COVERAGE;
if (key->fs.provoking_vertex_last == INTEL_ALWAYS)
f |= INTEL_FS_CONFIG_PROVOKING_VERTEX_LAST;
if (key->fs.persample_interp == INTEL_ALWAYS) {
f |= INTEL_FS_CONFIG_PERSAMPLE_DISPATCH |
INTEL_FS_CONFIG_PERSAMPLE_INTERP;
}
NIR_PASS(_, nir, nir_inline_sysval, nir_intrinsic_load_fs_config_intel,
f);
}
} else {
brw_nir_apply_key(pt, &key->base, simd_width);
}
brw_postprocess_nir_opts(pt);
/* Specialize subgroup intrinsics for the chosen SIMD width. */
NIR_PASS(_, nir, nir_shader_intrinsics_pass, jay_nir_lower_simd,
nir_metadata_control_flow, &simd_width);
NIR_PASS(_, nir, nir_opt_algebraic_late);
NIR_PASS(_, nir, intel_nir_opt_peephole_imul32x16);
/* Late postprocess while remaining in SSA */
/* Run fsign lowering again after the last time brw_nir_optimize is called.
* As is the case with conversion lowering (below), brw_nir_optimize can
* create additional fsign instructions.
*/
NIR_PASS(_, nir, jay_nir_lower_fsign);
NIR_PASS(_, nir, jay_nir_lower_bool);
NIR_PASS(_, nir, nir_opt_cse);
NIR_PASS(_, nir, nir_opt_dce);
NIR_PASS(_, nir, jay_nir_opt_sel_zero);
/* Run nir_split_conversions only after the last time
* brw_nir_optimize is called. Various optimizations invoked there can
* rematerialize the conversions that the lowering pass eliminates.
*/
const nir_split_conversions_options split_conv_opts = {
.callback = intel_nir_split_conversions_cb,
};
NIR_PASS(_, nir, nir_split_conversions, &split_conv_opts);
/* Do this only after the last opt_gcm. GCM will undo this lowering. */
if (stage == MESA_SHADER_FRAGMENT) {
NIR_PASS(_, nir, intel_nir_lower_non_uniform_barycentric_at_sample);
}
NIR_PASS(_, nir, nir_opt_constant_folding);
NIR_PASS(_, nir, nir_lower_load_const_to_scalar);
NIR_PASS(_, nir, nir_lower_all_phis_to_scalar);
NIR_PASS(_, nir, nir_opt_copy_prop);
NIR_PASS(_, nir, nir_opt_dce);
/* Jay requires LCSSA for correctness reading convergent loop-dependent
* values outside of a divergent loop. Converting to LCSSA inserts the
* required divergent 1-source phi after the loop.
*/
NIR_PASS(_, nir, nir_convert_to_lcssa, true, true);
/* Run divergence analysis at the end, after the defs have been given their
* final Jay indices.
*/
nir_sweep(nir);
nj_index_ssa_defs(nir);
nir_divergence_analysis(nir);
jay_populate_prog_data(devinfo, nir, prog_data, key, nr_packed_regs);
return simd_width;
}