jay: rewrite demote/terminate/helper/halt handling

* implement terminate
* fix HALT brokenness on all shader stages (we need a real end block)
* optimize demote codegen a ton
* optimize gl_HelperInvocation/gl_SampleMask
* optimize "all lanes demoted" via HALT.any
* optimize scheduling of stores/atomics/demotes in FS
* optimize some texturing with helper invocations

Signed-off-by: Alyssa Rosenzweig <alyssa.rosenzweig@intel.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/42097>
This commit is contained in:
Alyssa Rosenzweig 2026-06-11 12:47:46 -04:00 committed by Marge Bot
parent 52d4d47edc
commit 9cc686ac72
16 changed files with 405 additions and 178 deletions

View file

@ -1039,7 +1039,6 @@ visit_intrinsic(nir_intrinsic_instr *instr, struct divergence_state *state)
case nir_intrinsic_dpas_intel:
case nir_intrinsic_convert_cmat_intel:
case nir_intrinsic_load_coverage_mask_intel:
case nir_intrinsic_load_dispatch_mask_intel:
case nir_intrinsic_isberd_nv:
case nir_intrinsic_isbewr_nv:
case nir_intrinsic_vild_nv:

View file

@ -2686,9 +2686,6 @@ system_value("simd_width_intel", 1)
# IndirectDataStartAddress
system_value("indirect_address_intel", 1)
# The dispatch mask as provided in the FS payload.
system_value("dispatch_mask_intel", 1)
# The raw coverage mask as provided in the FS payload.
# The semantics of it depend on the HW state.
system_value("coverage_mask_intel", 1)
@ -2704,8 +2701,8 @@ intrinsic("load_reloc_const_intel", dest_comp=1, bit_sizes=[32],
indices=[PARAM_IDX, BASE], flags=[CAN_ELIMINATE, CAN_REORDER])
# Write a render target
# src[] = { color, dual_color, src0_alpha, omask, depth, stencil, predicate }
intrinsic("store_render_target_intel", [4, 4, 1, 1, 1, 1, 1], indices=[TARGET], bit_sizes=[32, 32, 32, 32, 32, 32, 1])
# src[] = { color, dual_color, src0_alpha, omask, depth, stencil }
intrinsic("store_render_target_intel", [4, 4, 1, 1, 1, 1], indices=[TARGET], bit_sizes=[32, 32, 32, 32, 32, 32])
# Shuffle with an offset in bytes instead of a lane index.
# src[] = { payload, lane offset in bytes }

View file

@ -72,10 +72,15 @@ assign_flag(struct flag_ra *ra,
jay_def tmp = jay_alloc_def(ra->b, file, 1);
unsigned num_flags = jay_num_regs(ra->b->shader, FLAG);
if (ra->b->shader->helpers_tracked) {
/* Helper tracking uses the last flag by definition */
num_flags--;
}
tmp.reg = tie ? tie->reg : ballot ? 0 : ((ra->roundrobin++) % num_flags);
/* Uniform access (via a UFLAG or an inverse-ballot) would clobber the zero
* for a ballot. We could refine this further but this should be ok for now.
* for a ballot. TODO: This needs to be reworked to get the flag back.
*/
if (!ballot &&
tmp.reg == 0 &&
@ -84,6 +89,8 @@ assign_flag(struct flag_ra *ra,
assert(!tie);
tmp.reg = 1;
ra->roundrobin++;
assert(num_flags >= 2); /* XXX: Not always true, FIXME */
}
if (jay_index(canonical) < ra->nr_vars) {
@ -193,6 +200,16 @@ assign_block(struct flag_ra *ra)
I->type = JAY_TYPE_U32;
I->dst = canonicalize_flag(I->dst);
continue;
} else if (I->op == JAY_OPCODE_SEND &&
jay_send_skip_helpers(I) &&
jay_is_no_mask(I)) {
/* jay_lower_helpers will clobber flag 0 to handle this case, see the
* logic there. Evict whatever was there.
*/
ra->flag_to_global[0] = 0;
assert(!I->predication);
continue;
} else if (I->type == JAY_TYPE_U1) {
/* Boolean logic turns into bitwise logic on the canonical form */
if (!jay_is_null(I->dst)) {

View file

@ -8,12 +8,14 @@
#include "compiler/brw/brw_eu_defines.h"
#include "compiler/brw/brw_nir.h"
#include "compiler/brw/brw_sampler.h"
#include "compiler/gen/gen_enums.h"
#include "compiler/intel_nir.h"
#include "compiler/intel_shader_enums.h"
#include "compiler/list.h"
#include "intel/dev/intel_debug.h"
#include "mda/debug_archiver.h"
#include "util/bitscan.h"
#include "util/bitset.h"
#include "util/lut.h"
#include "util/macros.h"
#include "util/u_math.h"
@ -81,10 +83,7 @@ struct nir_to_jay_state {
const struct intel_device_info *devinfo;
jay_builder bld;
jay_block *current_block;
jay_block *after_block;
jay_block *break_block;
jay_block *current_block, *after_block, *break_block, *exit_block;
unsigned indent;
bool needs_final_halt;
@ -832,19 +831,6 @@ scalars_equal(nir_scalar a, nir_scalar b)
nir_scalar_as_uint(a) == nir_scalar_as_uint(b));
}
/**
 * Emit the HALT_TARGET that re-enables the channels masked off by an earlier
 * HALT. Does nothing unless a final halt is pending, and clears the pending
 * state so the target is emitted at most once.
 */
static void
jay_emit_halt_target(struct nir_to_jay_state *nj)
{
   if (!nj->needs_final_halt)
      return;

   /* Clear the request first: this avoids re-emitting the halt target after
    * the EOT send.
    */
   nj->needs_final_halt = false;
   jay_HALT_TARGET(&nj->bld);
}
static void
jay_emit_fb_write(struct nir_to_jay_state *nj, nir_intrinsic_instr *intr)
{
@ -860,8 +846,6 @@ jay_emit_fb_write(struct nir_to_jay_state *nj, nir_intrinsic_instr *intr)
const int target = MAX2(((signed) nir_intrinsic_target(intr)), 0);
const bool last = !nir_instr_next(&intr->instr);
jay_emit_halt_target(nj);
/* The hardware freaks out if we give it an omask without multisampling. */
if (!b->shader->prog_data->fs.uses_omask) {
omask = jay_null();
@ -941,15 +925,10 @@ jay_emit_fb_write(struct nir_to_jay_state *nj, nir_intrinsic_instr *intr)
srcs[len++] = jay_extract(packed, i);
}
jay_inst *send =
jay_SEND(b, .sfid = GEN_SFID_RENDER_CACHE, .check_tdr = true,
.msg_desc = desc | (ex_desc << 32), .srcs = srcs, .nr_srcs = len,
.type = JAY_TYPE_U32, .eot = last, .split = split);
/* Handle the disable predicate. It is logically inverted. */
if (!nir_src_is_zero(intr->src[6])) {
jay_add_predicate(b, send, jay_negate(nj_src(intr->src[6])));
}
jay_SEND(b, .sfid = GEN_SFID_RENDER_CACHE, .check_tdr = true,
.msg_desc = desc | (ex_desc << 32), .srcs = srcs, .nr_srcs = len,
.type = JAY_TYPE_U32, .eot = last, .split = split,
.skip_helpers = true);
}
static enum lsc_data_size
@ -1572,19 +1551,6 @@ jay_emit_intrinsic(struct nir_to_jay_state *nj, nir_intrinsic_instr *intr)
jay_MOV(b, dst, fs->coverage_mask);
break;
case nir_intrinsic_load_dispatch_mask_intel: {
jay_def mask = jay_extract(nj->payload.u0, 15);
if (nj->s->dispatch_width == 32) {
/* TODO: Optimize */
jay_def hi = jay_extract(nj->payload.u1, 15);
mask = jay_BFI2_u32(b, 0xffff0000, hi, mask);
}
jay_MOV(b, dst, mask);
break;
}
case nir_intrinsic_load_subgroup_invocation: {
jay_def lid = jay_alloc_def(b, UGPR, s->dispatch_width / 2);
jay_LANE_ID_8(b, jay_extract_range(lid, 0, 4));
@ -1600,8 +1566,16 @@ jay_emit_intrinsic(struct nir_to_jay_state *nj, nir_intrinsic_instr *intr)
}
case nir_intrinsic_demote:
jay_DEMOTE_u32(b, jay_null(), jay_null());
break;
case nir_intrinsic_demote_if:
/* TODO: Already lowered, but need to implement for performance. */
jay_DEMOTE(b, JAY_TYPE_U1, nj_src(intr->src[0]), 0)->conditional_mod =
GEN_CONDITION_NE;
break;
case nir_intrinsic_load_helper_invocation:
case nir_intrinsic_is_helper_invocation:
jay_HELPER_SEL(b, dst, 1, 0);
break;
case nir_intrinsic_ddx:
@ -2455,7 +2429,8 @@ jay_emit_texture(struct nir_to_jay_state *nj, nir_tex_instr *tex)
.ex_desc = desc_ex_src, .header = header, .srcs = payload,
.nr_srcs = n_sources, .type = JAY_TYPE_U32,
.src_type = { src_type }, .dst = tmp, .uniform = payload_uniform,
.bindless = surface_bindless, .pure = true);
.bindless = surface_bindless, .pure = true,
.skip_helpers = tex->skip_helpers);
/* If we sampled into a temporary, copy out to the final */
if (residency) {
@ -2484,7 +2459,8 @@ jay_emit_jump(struct nir_to_jay_state *nj, nir_jump_instr *instr)
break;
case nir_jump_halt:
nj->needs_final_halt = true;
jay_HALT(&nj->bld);
jay_block_add_successor(nj->current_block, nj->exit_block, GPR);
jay_HALT(&nj->bld, false);
break;
case nir_jump_return:
/* Should be lowered */
@ -2754,8 +2730,16 @@ static void
jay_emit_eot(struct nir_to_jay_state *nj)
{
jay_builder *b = &nj->bld;
b->cursor = jay_after_block(nj->exit_block);
jay_emit_halt_target(nj);
/* Jump target for HALT */
if (nj->needs_final_halt) {
if (nj->s->stage == MESA_SHADER_FRAGMENT) {
assert(nj->s->helpers_tracked);
} else {
jay_HALT_TARGET(&nj->bld);
}
}
if (mesa_shader_stage_is_compute(nj->nir->info.stage)) {
jay_def u0 = nj->payload.u0;
@ -2773,12 +2757,18 @@ jay_emit_eot(struct nir_to_jay_state *nj)
.uniform = true);
} else if (nj->nir->info.stage == MESA_SHADER_VERTEX ||
nj->nir->info.stage == MESA_SHADER_TESS_EVAL) {
jay_block *block = jay_last_block(nj->f);
jay_block *block = jay_last_source_block(nj->f);
jay_inst *I = jay_last_inst(block);
assert(!nj->needs_final_halt && "halt not supported with URB");
/* TODO: What if this isn't the case? Do we need a no-op store...? */
assert(I && I->op == JAY_OPCODE_SEND && jay_send_sfid(I) == GEN_SFID_URB);
/* Pluck out the final SEND and put it in the exit block */
jay_set_send_eot(I, true);
jay_remove_instruction(I);
jay_builder_insert(b, I);
}
}
@ -3012,6 +3002,14 @@ setup_fragment_payload(struct nir_to_jay_state *nj, struct payload_builder *p)
}
}
/* INIT_HELPERS reads UGPRs but has no SSA write. Therefore to minimize
* pressure, we want to hoist it as much as possible.
*/
if (nj->s->helpers_tracked) {
jay_INIT_HELPERS(&nj->bld, jay_extract(nj->payload.u0, 15),
payload_u1(nj, 15, 1));
}
for (unsigned i = 0; i < ARRAY_SIZE(split_gprs); ++i) {
if (!jay_is_null(split[i]) && split_gprs[i].def->file == UGPR) {
*(split_gprs[i].def) =
@ -3178,7 +3176,11 @@ jay_from_nir_function(const struct intel_device_info *devinfo,
jay_setup_payload(&nj);
}
nj.exit_block = jay_create_block(&nj);
jay_emit_cf_list(&nj, &impl->body);
jay_block_add_successor(nj.current_block, nj.exit_block, GPR);
list_addtail(&nj.exit_block->link, &f->blocks);
jay_emit_eot(&nj);
jay_remove_unreachable_blocks(f);
}
@ -3216,8 +3218,9 @@ jay_compile(const struct intel_device_info *devinfo,
INTEL_DEBUG(intel_debug_flag_for_shader_stage(nir->info.stage)) &&
!(nir->info.internal || NIR_DEBUG(PRINT_INTERNAL));
bool track_helpers = false;
unsigned simd_width =
jay_process_nir(devinfo, nir, prog_data, key, archiver);
jay_process_nir(devinfo, nir, prog_data, key, archiver, &track_helpers);
if (debug) {
/* We can't use nir_print_shader since it reindexes SSA defs. */
@ -3232,6 +3235,7 @@ jay_compile(const struct intel_device_info *devinfo,
s->devinfo = devinfo;
s->prog_data = prog_data;
s->archiver = archiver;
s->helpers_tracked = track_helpers;
nir_foreach_function_impl(impl, nir) {
jay_from_nir_function(devinfo, nir, s, impl);
@ -3288,6 +3292,10 @@ jay_compile(const struct intel_device_info *devinfo,
JAY_PASS(s, jay_insert_payload_swizzle);
}
if (s->stage == MESA_SHADER_FRAGMENT && s->helpers_tracked) {
JAY_PASS(s, jay_lower_helpers);
}
if (!(jay_debug & JAY_DBG_NOOPT)) {
/* jay_assign_accumulators uses a conservative liveness analysis for
* predication, so assign accumulators before predicating for better

View file

@ -787,6 +787,12 @@ typedef struct jay_shader {
unsigned scratch_size;
unsigned payload_gprs, payload_ugprs, push_grfs;
/**
* In a fragment shader, whether a helper invocation flag is tracked. Flag RA
* must reserve the relevant flag.
*/
bool helpers_tracked;
/**
* Ralloc linear context. Since we don't typically free as we go,
* most allocations should go through this context for efficiency.
@ -1126,7 +1132,7 @@ jay_new_block(jay_function *f)
static inline bool
jay_op_is_control_flow(enum jay_opcode op)
{
return op >= JAY_OPCODE_BRD && op <= JAY_OPCODE_LOOP_ONCE;
return op >= JAY_OPCODE_BRD && op <= JAY_OPCODE_HALT;
}
/**
@ -1201,6 +1207,9 @@ jay_first_predecessor(jay_block *block, enum jay_file file)
#define jay_foreach_block_rev(f, v) \
list_for_each_entry_rev(jay_block, v, &f->blocks, link)
#define jay_foreach_block_safe_rev(f, v) \
list_for_each_entry_safe_rev(jay_block, v, &f->blocks, link)
#define jay_foreach_block_from(f, from, v) \
list_for_each_entry_from(jay_block, v, from, &f->blocks, link)
@ -1238,7 +1247,7 @@ jay_first_predecessor(jay_block *block, enum jay_file file)
jay_foreach_inst_in_block_safe(block, v)
#define jay_foreach_inst_in_func_safe_rev(func, block, v) \
jay_foreach_block_rev(func, block) \
jay_foreach_block_safe_rev(func, block) \
jay_foreach_inst_in_block_safe_rev(block, v)
#define jay_foreach_inst_in_shader(s, func, inst) \
@ -1355,6 +1364,15 @@ jay_last_block(jay_function *f)
return list_last_entry(&f->blocks, jay_block, link);
}
/**
 * Return the block linked immediately before the function's final block, or
 * NULL when the block list is empty or holds a single block.
 */
static inline jay_block *
jay_last_source_block(jay_function *f)
{
   bool has_predecessor_block =
      !list_is_empty(&f->blocks) && !list_is_singular(&f->blocks);

   if (!has_predecessor_block)
      return NULL;

   /* Step one entry back from the final block's own link */
   jay_block *final_block = jay_last_block(f);
   return list_last_entry(&final_block->link, jay_block, link);
}
static inline jay_inst *
jay_last_inst(jay_block *block)
{
@ -1373,11 +1391,14 @@ jay_next_block(jay_block *block)
static inline void
jay_block_add_successor(jay_block *block, jay_block *succ, enum jay_file file)
{
/* Prune duplicate successors so the caller doesn't need to worry */
jay_block **succs = jay_successors(block, file);
unsigned i = succs[0] ? 1 : 0;
if (succs[0] == succ || succs[1] == succ) {
return;
}
assert(succ && succs[0] != succ && succs[1] != succ);
assert(succs[i] == NULL && "at most 2 successors");
unsigned i = succs[0] ? 1 : 0;
assert(succ && succs[i] == NULL && "at most 2 successors");
succs[i] = succ;
util_dynarray_append(jay_predecessors(succ, file), block);

View file

@ -0,0 +1,191 @@
/*
* Copyright 2026 Intel Corporation
* SPDX-License-Identifier: MIT
*/
#include "compiler/gen/gen_enums.h"
#include "util/list.h"
#include "util/u_dynarray.h"
#include "jay_builder.h"
#include "jay_builder_opcodes.h"
#include "jay_ir.h"
#include "jay_opcodes.h"
#include "jay_private.h"
/* State shared across the whole jay_lower_helpers pass while it walks the
 * entrypoint's blocks and instructions in reverse.
 */
struct ctx {
/* NOTE(review): not read or written by the code visible in this file beyond
 * zero-initialization - confirm whether this member is still needed.
 */
jay_block *last_source_block;
/* Flag register tracking helper invocations (a set bit means the lane is a
 * helper). jay_lower_helpers points this at the last FLAG register.
 */
jay_def helper_flag;
/* halted: a HALT was found or inserted, so the exit block needs a
 * HALT_TARGET. uses_terminate: a HALT instruction was encountered during the
 * walk, which suppresses inserting extra HALTs after demotes.
 */
bool halted, uses_terminate;
/* Number of instructions visited so far by the reverse walk, used as a
 * profitability estimate for halting after a demote.
 */
unsigned instr_left;
};
/*
 * Takes src, a linked list containing the element pivot, and dst, an empty
 * list head. Moves all elements up to and including pivot from src to dst,
 * leaving the rest in src. Semantically equivalent to a loop of list_move but
 * O(1) time regardless of the position of pivot in the list.
 *
 * pivot may be the last element of src, in which case src ends up empty.
 */
static void
list_partition(struct list_head *src,
               struct list_head *dst,
               struct list_head *pivot)
{
   /* dst runs from src[0] to pivot */
   dst->next = src->next;
   dst->prev = pivot;
   dst->next->prev = dst;

   /* src runs from pivot[1:] to the end of src. src->prev already points at
    * the tail, so only the head side needs relinking. If pivot was the tail,
    * pivot->next is src itself and the back-link store below resets src->prev
    * to src, leaving src empty.
    */
   src->next = pivot->next;
   src->next->prev = src;

   /* Close dst's ring. This must come last since pivot->next is read above. */
   pivot->next = dst;

   list_validate(dst);
   list_validate(src);
}
/*
 * Lower the helper-tracking pseudo-instructions (INIT_HELPERS, DEMOTE,
 * HELPER_SEL and SENDs flagged skip_helpers) of a single block into real
 * operations on ctx->helper_flag.
 *
 * The block is walked in reverse with the builder cursor placed before the
 * instruction being visited, so replacements are inserted in place. When a
 * HALT is inserted after a demote the block is split and the function returns
 * early; the instructions that moved into the new block are not processed by
 * this call.
 */
static void
process_block(struct ctx *ctx, jay_builder *b, jay_block *block)
{
jay_foreach_inst_in_block_safe_rev(block, I) {
b->cursor = jay_before_inst(I);
if (I->op == JAY_OPCODE_INIT_HELPERS) {
/* Seed the helper flag with the complement of the dispatch mask, one 16-bit
 * half at a time. The second half is only written when a second source is
 * present.
 */
jay_NOT(b, ctx->helper_flag, I->src[0])->type = JAY_TYPE_U16;
if (!jay_is_null(I->src[1])) {
jay_def hi = ctx->helper_flag;
hi.hi = true;
jay_NOT(b, hi, I->src[1])->type = JAY_TYPE_U16;
}
jay_remove_instruction(I);
} else if (I->op == JAY_OPCODE_HALT) {
/* Remember the HALT so the exit block gets a HALT_TARGET and so that no
 * extra HALTs are inserted after demotes earlier in the program.
 */
ctx->halted = ctx->uses_terminate = true;
} else if (I->op == JAY_OPCODE_DEMOTE) {
enum gen_condition cond = I->conditional_mod;
jay_def x = I->src[0], y = I->src[1];
/* Unconditional discard */
if (!cond) {
/* Compare a register against itself for equality so the condition holds
 * for every lane.
 */
cond = GEN_CONDITION_EQ;
I->type = JAY_TYPE_U32;
x = y = jay_bare_reg(UGPR, 0);
}
/* The demote becomes a compare that writes the helper flag, predicated on
 * the lane not already being marked as a helper.
 */
jay_inst *cmp = jay_CMP(b, I->type, cond, ctx->helper_flag, x, y);
jay_add_predicate(b, cmp, jay_negate(ctx->helper_flag));
jay_remove_instruction(I);
/* We are allowed to halt after a demote if all lanes are inactive
* for performance, but it's not required for correctness. Only do
* it if it's likely profitable.
*
* We assume a shader either uses SPIR-V demote or terminate, but
* not both. If the shader uses terminate, there will be an actual
* HALT instruction after us so we don't bother with a second HALT
* here. Strictly there's a corner case here if all non-helpers are
* terminated but lanes spawned as helpers are not terminated, but
* this is probably reasonable as a tradeoff.
*/
if (ctx->instr_left > 6 && !ctx->uses_terminate) {
/* HALT predicated on the helper flag, with predicate_all set. */
jay_inst *halt = jay_HALT(b, true);
halt = jay_add_predicate(b, halt, ctx->helper_flag);
ctx->halted = true;
/* Split the block right after the HALT: everything up to and including it
 * moves into a new block linked into the block list next to this one.
 * NOTE(review): that new block now ends in a HALT, so when it is walked
 * later it appears to take the HALT branch above and set uses_terminate -
 * confirm this is intended.
 */
jay_block *split = jay_new_block(b->func);
split->indent = block->indent;
list_partition(&block->instructions, &split->instructions,
&halt->link);
list_addtail(&split->link, &block->link);
/* The split block either falls through or jumps to the exit */
/* Retarget every predecessor edge of the original block at the split block */
for (unsigned file = GPR; file <= UGPR; ++file) {
jay_foreach_predecessor(block, pred, file) {
jay_block **succs = jay_successors(*pred, file);
unsigned idx = succs[0] == block ? 0 : 1;
succs[idx] = split;
}
}
/* Hand the predecessor arrays over to the split block and give the original
 * block fresh empty ones before adding the new edges.
 */
typed_memcpy(&split->physical_preds, &block->physical_preds, 1);
typed_memcpy(&split->logical_preds, &block->logical_preds, 1);
util_dynarray_init(&block->physical_preds, block);
util_dynarray_init(&block->logical_preds, block);
jay_block_add_successor(split, block, GPR);
jay_block_add_successor(split, jay_last_block(b->func), GPR);
/* Stop here: the rest of the original instructions now live in the split
 * block. Presumably the caller's reverse block walk visits it next - confirm.
 */
return;
}
} else if (I->op == JAY_OPCODE_HELPER_SEL) {
/* Select between the two sources using the helper flag as the condition */
jay_SEL(b, JAY_TYPE_U32, I->dst, I->src[0], I->src[1],
ctx->helper_flag);
jay_remove_instruction(I);
} else if (I->op == JAY_OPCODE_SEND && jay_send_skip_helpers(I)) {
if (jay_is_no_mask(I)) {
/* jay_assign_flags ensured this is free for us, see logic there */
/* Compute NOT(helper flag) with a != 0 condition written to uniform flag 0
 * and predicate the send on that. Presumably this means the send runs when
 * at least one lane is not a helper - confirm.
 */
jay_def t = jay_bare_reg(UFLAG, 0);
jay_inst *not = jay_NOT(b, jay_null(), ctx->helper_flag);
not->type = JAY_TYPE_U | b->shader->dispatch_width;
jay_set_conditional_mod(b, not, t, GEN_CONDITION_NE);
jay_add_predicate(b, I, t);
} else {
/* Per-lane case: only execute the send for lanes that are not helpers */
jay_add_predicate(b, I, jay_negate(ctx->helper_flag));
}
}
/* Count every visited instruction towards the profitability estimate */
++ctx->instr_left;
}
}
/*
 * Entry point of the helper-invocation lowering pass. Asserts that the shader
 * tracks helpers, lowers every block of the entrypoint in reverse order via
 * process_block, then fills out the exit block: a HALT_TARGET if any HALT
 * exists, followed by the send that carries EOT.
 */
void
jay_lower_helpers(jay_shader *shader)
{
jay_function *entrypoint = jay_shader_get_entrypoint(shader);
jay_block *exit_block = jay_last_block(entrypoint);
/* Captured before any block splitting done by process_block.
 * NOTE(review): jay_last_source_block returns NULL when the function has
 * fewer than two blocks; presumably that cannot happen here - confirm.
 */
jay_block *last_source_block = jay_last_source_block(entrypoint);
/* By ABI with jay_assign_flags, the last flag is used to track helpers */
assert(shader->helpers_tracked);
unsigned helper_flag_no = jay_num_regs(shader, FLAG) - 1;
/* All other ctx members start out zero/false */
struct ctx ctx = { .helper_flag = jay_bare_reg(FLAG, helper_flag_no) };
jay_builder b = jay_init_builder(entrypoint, jay_after_block(exit_block));
jay_foreach_block_rev(entrypoint, block) {
process_block(&ctx, &b, block);
}
/* Fill out the exit block */
b.cursor = jay_after_block(exit_block);
if (ctx.halted) {
jay_HALT_TARGET(&b);
}
/* Try to pluck out the last instruction and use it for EOT. This breaks SSA
* dominance invariants but that's why this is a post-RA, post-sched pass.
* Only SWSB has to deal with the resulting mess.
*
* There may be no such send (in case of an unconditional terminate). In that
* case, insert a predicated-out null RT write to use for EOT.
*/
jay_inst *send = jay_last_inst(last_source_block);
if (send && send->op == JAY_OPCODE_SEND && jay_send_eot(send)) {
/* Move the existing EOT send to the builder cursor in the exit block */
jay_remove_instruction(send);
jay_builder_insert(&b, send);
} else {
/* Build a render target write to the null RT with EOT set, sourcing four
 * values starting at GPR 0 as a dummy payload, and predicate it on the
 * lane not being a helper.
 */
jay_def dummy = jay_bare_reg(GPR, 0);
dummy.num_values_m1 = 4 - 1;
unsigned op = shader->dispatch_width == 32 ?
XE2_DATAPORT_RENDER_TARGET_WRITE_SIMD32_SINGLE_SOURCE :
BRW_DATAPORT_RENDER_TARGET_WRITE_SIMD16_SINGLE_SOURCE;
uint64_t desc = brw_fb_write_desc(shader->devinfo, 0, op, true, false);
uint64_t ex_desc = (1 << 20) /* null rt */;
send = jay_SEND(&b, .sfid = GEN_SFID_RENDER_CACHE, .check_tdr = true,
.msg_desc = desc | (ex_desc << 32), .nr_srcs = 1,
.srcs = &dummy, .type = JAY_TYPE_U32, .eot = true);
send = jay_add_predicate(&b, send, jay_negate(ctx.helper_flag));
}
}

View file

@ -96,7 +96,7 @@ try_swap_src01(jay_inst *I)
if (I->op == JAY_OPCODE_SEL) {
/* sel(a, b, p) = sel(b, a, !p) */
I->src[2].negate ^= true;
} else if (I->op == JAY_OPCODE_CMP) {
} else if (I->op == JAY_OPCODE_CMP || I->op == JAY_OPCODE_DEMOTE) {
I->conditional_mod = gen_condition_swap_sources(I->conditional_mod);
} else if (I->op == JAY_OPCODE_BFN) {
jay_set_bfn_ctrl(I, util_lut3_swap_sources(jay_bfn_ctrl(I), 0, 1));

View file

@ -46,21 +46,6 @@ nj_index_ssa_defs(nir_shader *nir)
}
}
/**
 * NIR intrinsic-pass callback: replace load_helper_invocation with the
 * logical NOT of the inverse ballot of the Intel dispatch mask.
 *
 * Returns true when the intrinsic was rewritten, false for any other
 * intrinsic.
 */
static bool
lower_helper_invocation(nir_builder *b, nir_intrinsic_instr *intr, void *_)
{
   if (intr->intrinsic == nir_intrinsic_load_helper_invocation) {
      /* TODO: Is this right for multisampling? */
      b->cursor = nir_before_instr(&intr->instr);

      nir_def *dispatch_mask = nir_load_dispatch_mask_intel(b);
      nir_def *dispatched = nir_inverse_ballot(b, dispatch_mask);
      nir_def_replace(&intr->def, nir_inot(b, dispatched));
      return true;
   }

   return false;
}
static bool
lower_frag_coord(nir_builder *b, nir_intrinsic_instr *intr, void *simd_)
{
@ -178,8 +163,7 @@ insert_rt_store(nir_builder *b,
nir_def *src0_colour,
nir_def *depth,
nir_def *stencil,
nir_def *sample_mask,
nir_def *disable)
nir_def *sample_mask)
{
bool null_rt = target < 0;
@ -197,8 +181,7 @@ insert_rt_store(nir_builder *b,
nir_def *src0_alpha = nir_channel_or_undef(b, src0_colour ?: colour, 3);
nir_store_render_target_intel(b, colour, dual_colour, src0_alpha,
sample_mask, depth, stencil, disable,
.target = target);
sample_mask, depth, stencil, .target = target);
}
static void
@ -216,14 +199,10 @@ lower_fragment_outputs(nir_function_impl *impl,
nir_def *undef = nir_undef(b, 1, 32);
nir_def *disable = b->shader->info.fs.uses_discard ?
nir_is_helper_invocation(b, 1) :
nir_imm_false(b);
if (ctx.dual_blend) {
insert_rt_store(b, 0, ctx.colour[0], ctx.colour[1], NULL,
ctx.depth ?: undef, ctx.stencil ?: undef,
ctx.sample_mask ?: undef, disable);
ctx.sample_mask ?: undef);
return;
}
@ -239,83 +218,13 @@ lower_fragment_outputs(nir_function_impl *impl,
if (ctx.colour[i]) {
insert_rt_store(b, i, ctx.colour[i], NULL,
i > 0 ? ctx.colour[0] : NULL, ctx.depth ?: undef,
ctx.stencil ?: undef, ctx.sample_mask ?: undef,
disable);
ctx.stencil ?: undef, ctx.sample_mask ?: undef);
}
}
insert_rt_store(b, last, last >= 0 ? ctx.colour[last] : NULL, NULL,
last > 0 ? ctx.colour[0] : NULL, ctx.depth ?: undef,
ctx.stencil ?: undef, ctx.sample_mask ?: undef, disable);
}
/**
* Drop render target stores with unconditional discards.
*/
static bool
opt_unconditional_discards(nir_shader *nir)
{
nir_function_impl *impl = nir_shader_get_entrypoint(nir);
nir_block *block = nir_impl_last_block(impl);
bool progress = false;
bool any_remaining_rt_writes = false;
nir_foreach_instr_reverse_safe(instr, block) {
if (instr->type != nir_instr_type_intrinsic)
continue;
nir_intrinsic_instr *intr = nir_instr_as_intrinsic(instr);
if (intr->intrinsic == nir_intrinsic_store_render_target_intel) {
nir_scalar discard = nir_scalar_resolved(intr->src[6].ssa, 0);
if (nir_scalar_is_const(discard) && nir_scalar_as_bool(discard)) {
/* Drop store with unconditional discard */
nir_instr_remove(instr);
progress = true;
} else {
/* This RT store might actually happen */
any_remaining_rt_writes = true;
}
} else if ((intr->intrinsic == nir_intrinsic_demote ||
intr->intrinsic == nir_intrinsic_terminate) &&
!any_remaining_rt_writes) {
/* Delete unconditional demotes/terminates in the end block... */
nir_instr_remove(instr);
progress = true;
} else {
/* ...but stop if we find an intrinsic that has a side-effect */
const nir_intrinsic_info *info = &nir_intrinsic_infos[intr->intrinsic];
if (!(info->flags & NIR_INTRINSIC_CAN_ELIMINATE))
break;
}
}
/* See if discards still exist in the program and flag accordingly */
nir->info.fs.uses_discard = false;
nir_foreach_block(block, impl) {
nir_foreach_instr(instr, block) {
if (instr->type == nir_instr_type_intrinsic) {
nir_intrinsic_instr *intr = nir_instr_as_intrinsic(instr);
if (intr->intrinsic == nir_intrinsic_demote ||
intr->intrinsic == nir_intrinsic_demote_if ||
intr->intrinsic == nir_intrinsic_terminate ||
intr->intrinsic == nir_intrinsic_terminate_if)
nir->info.fs.uses_discard = true;
}
}
}
/* If we eliminated all RT stores, add a Null RT store to end the thread. */
if (!any_remaining_rt_writes) {
nir_builder b = nir_builder_at(nir_after_impl(impl));
nir_def *undef = nir_undef(&b, 1, 32);
insert_rt_store(&b, -1, NULL, NULL, NULL, undef, undef, undef,
nir_imm_true(&b));
}
return nir_progress(progress, impl, nir_metadata_control_flow);
ctx.stencil ?: undef, ctx.sample_mask ?: undef);
}
unsigned
@ -323,7 +232,8 @@ jay_process_nir(const struct intel_device_info *devinfo,
nir_shader *nir,
union brw_any_prog_data *prog_data,
union brw_any_prog_key *key,
debug_archiver *archiver)
debug_archiver *archiver,
bool *track_helpers)
{
enum mesa_shader_stage stage = nir->info.stage;
struct brw_compiler compiler = { .devinfo = devinfo };
@ -475,10 +385,14 @@ jay_process_nir(const struct intel_device_info *devinfo,
lower_fragment_outputs(nir_shader_get_entrypoint(nir), devinfo,
key->fs.nr_color_regions, simd_width);
JAY_NIR_PASS(nir_lower_helper_writes, true);
JAY_NIR_PASS(nir_lower_is_helper_invocation);
JAY_NIR_PASS(nir_shader_intrinsics_pass, lower_helper_invocation,
nir_metadata_control_flow, NULL);
/* nir_lower_terminate_to_demote will hamper our ability to schedule
* terminates (since it turns them into real control flow), so run
* nir_opt_move_discards_to_top first as a prepass. That should help
* scheduling demotes too (which is more important).
*/
JAY_NIR_PASS(nir_opt_move_discards_to_top);
JAY_NIR_PASS(nir_lower_terminate_to_demote);
if (key->fs.alpha_to_coverage != INTEL_NEVER) {
/* Run constant fold optimization in order to get the correct source
@ -495,8 +409,6 @@ jay_process_nir(const struct intel_device_info *devinfo,
*/
brw_nir_optimize(pt);
NIR_PASS(_, nir, opt_unconditional_discards);
// TODO
// JAY_NIR_PASS(brw_nir_move_interpolation_to_top);
@ -556,10 +468,31 @@ jay_process_nir(const struct intel_device_info *devinfo,
/* Run divergence analysis at the end */
nir_sweep(nir);
nj_index_ssa_defs(nir);
nir_divergence_analysis(nir);
if (stage != MESA_SHADER_FRAGMENT)
if (stage == MESA_SHADER_FRAGMENT) {
/* Certain features require tracking helpers for correctness */
nir_shader_gather_info(nir, nir_shader_get_entrypoint(nir));
*track_helpers |= nir->info.fs.uses_discard || nir->info.writes_memory;
*track_helpers |= BITSET_TEST(nir->info.system_values_read,
SYSTEM_VALUE_HELPER_INVOCATION);
/* ...but this is more subtle. nir_opt_load_skip_helpers flags texturing
* operations that we can skip for bandwidth savings. We need divergence
* info for this, so we run late.
*
* We may or may not want to force track_helpers on if this makes
* progress. Possibly driconf'ing on furmark makes sense.
*/
struct nir_opt_load_skip_helpers_options skip_helpers = {
.no_add_divergence = true
};
JAY_NIR_PASS(nir_opt_load_skip_helpers, &skip_helpers);
} else {
jay_populate_prog_data(devinfo, nir, prog_data, key, nr_packed_regs);
}
/* This must be the very last pass since nir_print itself will reindex! */
nj_index_ssa_defs(nir);
return simd_width;
}

View file

@ -81,7 +81,6 @@ op('bfrev', 1, 'u32', Props.NEGATE)
op('cbit', 1, 'u32')
op('cmp', 2, 'u32', Props.NEGATE | Props.CMOD)
# With an 8/16-bit type, `index` specifies the element index of the source
# within the 32-bit word. For example, if src_type == U16 and index == 1, this
# converts the upper 16-bits of the input.
@ -134,9 +133,11 @@ op('schedule_barrier', 0, None, Props.NO_DEST)
for n in ['brd', 'illegal', 'goto', 'join', 'if', 'else',
'endif', 'while', 'break', 'cont', 'call', 'calla', 'jmpi', 'ret',
'loop_once', 'halt', 'halt_target']:
'loop_once', 'halt_target']:
op(n, 0, None, Props.NO_DEST)
op('halt', 0, None, Props.NO_DEST, ['bool predicate_all'])
op('send', 4, None, Props.SIDE_EFFECTS, [
'gen_sfid sfid',
'uint8_t sbid',
@ -234,6 +235,15 @@ op('dpas', 3, 'u32', 0, [
'uint8_t pad[3]',
])
# Initialize helper invocations. Takes 16-bit halves of the dispatch mask.
op('init_helpers', 2, 'u16', Props.NO_DEST)
# Compare the arguments and demote based on the result.
op('demote', 2, 'u1 u16 u32 u64 s16 s32 s64 f16 f32 f64', Props.NEGATE | Props.NO_DEST)
# Equivalent to NIR bcsel(@is_helper_invocation, source 0, source 1)
op('helper_sel', 2, 'u1 u32')
OPCODES = _opcodes
ENUMS: 'Mapping[str, tuple[str, list[str]]]' = {

View file

@ -3,6 +3,7 @@
* SPDX-License-Identifier: MIT
*/
#include "compiler/gen/gen_enums.h"
#include "util/bitset.h"
#include "util/lut.h"
#include "jay_builder.h"
@ -121,6 +122,30 @@ propagate_not(jay_inst *I, unsigned s, jay_inst *mod)
}
}
/**
 * Fuse demote(cmp(x, y) != 0) to demote(x CMP y).
 *
 * Only a U1 "source != 0" demote whose SSA source is defined by an
 * unpredicated CMP is rewritten; the demote then takes over the CMP's
 * condition, operands and type. Anything else is left untouched.
 */
static void
fuse_demote(jay_inst *demote, jay_inst **defs)
{
   if (!jay_is_ssa(demote->src[0]))
      return;

   if (!jay_is_zero(demote->src[1]))
      return;

   if (demote->type != JAY_TYPE_U1 ||
       demote->conditional_mod != GEN_CONDITION_NE)
      return;

   /* Look up the instruction defining the demoted boolean */
   jay_inst *producer = defs[jay_index(demote->src[0])];
   bool fusable = producer->op == JAY_OPCODE_CMP && !producer->predication;
   if (!fusable)
      return;

   demote->type = producer->type;
   demote->conditional_mod = producer->conditional_mod;
   demote->src[0] = producer->src[0];
   demote->src[1] = producer->src[1];
}
static void
propagate_forwards(jay_function *f)
{
@ -156,6 +181,11 @@ propagate_forwards(jay_function *f)
if (I->op == JAY_OPCODE_PHI_SRC || I->op == JAY_OPCODE_SEND)
continue;
/* We fuse demote forwards & upfront to avoid fighting cmod prop */
if (I->op == JAY_OPCODE_DEMOTE) {
fuse_demote(I, defs);
}
jay_foreach_ssa_src(I, s) {
/* Copy propagate whole vectors */
jay_def src = I->src[s];

View file

@ -34,7 +34,8 @@ unsigned jay_process_nir(const struct intel_device_info *devinfo,
nir_shader *nir,
union brw_any_prog_data *prog_data,
union brw_any_prog_key *key,
debug_archiver *archiver);
debug_archiver *archiver,
bool *track_helpers);
void jay_compute_liveness(jay_function *f);
void jay_calculate_register_demands(jay_function *f);
@ -84,6 +85,7 @@ void jay_schedule_pressure(jay_shader *s);
void jay_lower_pre_ra(jay_shader *s);
void jay_lower_post_ra(jay_shader *s);
void jay_lower_helpers(jay_shader *s);
void jay_lower_spill(jay_function *func);
void jay_lower_simd_width(jay_shader *s);
void jay_lower_scoreboard(jay_shader *s);

View file

@ -100,9 +100,17 @@ populate_dag(struct sched_ctx *ctx,
address = ctx->dag.node;
}
/* Serialize side effects for now */
/* Serialize side effects for now, including SENDs which need to be
* predicated away after a demote.
*/
if ((I->op == JAY_OPCODE_SEND && !jay_send_pure(I)) ||
I->op == JAY_OPCODE_SCHEDULE_BARRIER) {
I->op == JAY_OPCODE_SCHEDULE_BARRIER ||
I->op == JAY_OPCODE_INIT_HELPERS ||
I->op == JAY_OPCODE_DEMOTE ||
I->op == JAY_OPCODE_HELPER_SEL ||
(I->op == JAY_OPCODE_SEND &&
func->shader->helpers_tracked &&
jay_send_skip_helpers(I))) {
jay_dag_add_edge(&ctx->dag, sidefx);
sidefx = ctx->dag.node;

View file

@ -22,6 +22,7 @@ max_simd_width(const jay_shader *shader, const jay_inst *I)
I->op == JAY_OPCODE_EXTRACT_BYTE_PER_8LANES ||
I->op == JAY_OPCODE_OFFSET_PACKED_PIXEL_COORDS ||
I->op == JAY_OPCODE_DESWIZZLE_ODD ||
I->op == JAY_OPCODE_INIT_HELPERS ||
I->op == JAY_OPCODE_MUL_32 ||
I->op == JAY_OPCODE_SHUFFLE ||
I->op == JAY_OPCODE_ZIP_UGPR16) {

View file

@ -79,7 +79,7 @@ to_gen_operand(
gen_operand R;
unsigned reg = d.reg, count = jay_num_values(d);
unsigned offset_B = 0, grf = 0;
assert(!hi || d.file == GPR);
assert(!hi || d.file == GPR || d.file == FLAG);
if (count && (d.file == GPR || d.file == UGPR)) {
struct jay_register_block block =
@ -189,7 +189,8 @@ to_gen_operand(
* SIMD1 instructions and are never SIMD split.
*/
assert(simd_offs == 0 || idx >= 0);
unsigned offs_B = d.reg * (f->shader->dispatch_width / 8);
unsigned offs_B =
(d.reg * (f->shader->dispatch_width / 8)) + (hi ? 2 : 0);
R = gen_flag(offs_B / 2);
} else if (d.file == J_ADDRESS) {
R = gen_address(d.reg);
@ -580,6 +581,14 @@ emit(struct jay_codegen *jc,
}
break;
case JAY_OPCODE_HALT:
if (jay_halt_predicate_all(I)) {
assert(I->predication);
gen->pred_control =
jc->devinfo->ver >= 20 ? GEN_PREDICATE_XE2_ALL : GEN_PREDICATE_ALLV;
}
break;
case JAY_OPCODE_HALT_TARGET:
/* HALT temporarily disables channels, and the same instruction is used
* to re-enable them: once all channels are disabled, then they are

View file

@ -33,7 +33,9 @@ block_state_for_inst(jay_inst *I)
if (I->op == JAY_OPCODE_PHI_DST || I->op == JAY_OPCODE_PRELOAD) {
return STATE_PHI_DST;
} else if (I->op == JAY_OPCODE_PHI_SRC ||
(jay_op_is_control_flow(I->op) && I->op != JAY_OPCODE_ELSE)) {
(jay_op_is_control_flow(I->op) &&
I->op != JAY_OPCODE_ELSE &&
I->op != JAY_OPCODE_HALT_TARGET)) {
return STATE_LATE;
} else {
return STATE_NORMAL;
@ -238,10 +240,6 @@ validate_inst(struct validate_state *validate, jay_inst *I)
validate_flagness(validate, I->dst, I->type, "destination");
validate_flagness(validate, I->cond_flag, JAY_TYPE_U1, "cond_flag");
CHECK(!I->conditional_mod ||
!jay_is_null(I->cond_flag) ||
I->op == JAY_OPCODE_CSEL);
/* These assumptions are baked into the definition of broadcast_flag and
* required to ensure correctness with the lane masking.
*/
@ -256,8 +254,10 @@ validate_inst(struct validate_state *validate, jay_inst *I)
CHECK(I->cond_flag.file != FLAG || I->dst.file != UGPR);
/* Standard modifiers only allowed on some instructions */
CHECK(!I->conditional_mod || opinfo->cmod || I->op == JAY_OPCODE_CSEL);
CHECK(!I->saturate || opinfo->sat);
CHECK(!I->conditional_mod ||
(I->op == JAY_OPCODE_CSEL || I->op == JAY_OPCODE_DEMOTE) ||
(!jay_is_null(I->cond_flag) && opinfo->cmod));
unsigned num_srcs = I->num_srcs;

View file

@ -54,6 +54,7 @@ libintel_compiler_jay_files = files(
'jay_ir.h',
'jay_insert_fp_mode.c',
'jay_liveness.c',
'jay_lower_helpers.c',
'jay_lower_post_ra.c',
'jay_lower_pre_ra.c',
'jay_lower_scoreboard.c',