From 9cc686ac720c05d935074a4e54f76eca3dc0c83c Mon Sep 17 00:00:00 2001 From: Alyssa Rosenzweig Date: Thu, 11 Jun 2026 12:47:46 -0400 Subject: [PATCH] jay: rewrite demote/terminate/helper/halt handling * implement terminate * fix HALT brokenness on all shader stages (we need a real end block) * optimize demote codegen a ton * optimize gl_HelperInvocation/gl_SampleMask * optimize "all lanes demoted" via HALT.any * optimize scheduling of stores/atomics/demotes in FS * optimize some texturing with helper invocations Signed-off-by: Alyssa Rosenzweig Part-of: --- src/compiler/nir/nir_divergence_analysis.c | 1 - src/compiler/nir/nir_intrinsics.py | 7 +- src/intel/compiler/jay/jay_assign_flags.c | 19 +- src/intel/compiler/jay/jay_from_nir.c | 102 ++++++----- src/intel/compiler/jay/jay_ir.h | 31 +++- src/intel/compiler/jay/jay_lower_helpers.c | 191 +++++++++++++++++++++ src/intel/compiler/jay/jay_lower_pre_ra.c | 2 +- src/intel/compiler/jay/jay_nir.c | 143 ++++----------- src/intel/compiler/jay/jay_opcodes.py | 14 +- src/intel/compiler/jay/jay_opt_propagate.c | 30 ++++ src/intel/compiler/jay/jay_private.h | 4 +- src/intel/compiler/jay/jay_schedule.c | 12 +- src/intel/compiler/jay/jay_simd_width.c | 1 + src/intel/compiler/jay/jay_to_binary.c | 13 +- src/intel/compiler/jay/jay_validate.c | 12 +- src/intel/compiler/jay/meson.build | 1 + 16 files changed, 405 insertions(+), 178 deletions(-) create mode 100644 src/intel/compiler/jay/jay_lower_helpers.c diff --git a/src/compiler/nir/nir_divergence_analysis.c b/src/compiler/nir/nir_divergence_analysis.c index ba29f2f041a..052dd3187f8 100644 --- a/src/compiler/nir/nir_divergence_analysis.c +++ b/src/compiler/nir/nir_divergence_analysis.c @@ -1039,7 +1039,6 @@ visit_intrinsic(nir_intrinsic_instr *instr, struct divergence_state *state) case nir_intrinsic_dpas_intel: case nir_intrinsic_convert_cmat_intel: case nir_intrinsic_load_coverage_mask_intel: - case nir_intrinsic_load_dispatch_mask_intel: case nir_intrinsic_isberd_nv: case 
nir_intrinsic_isbewr_nv: case nir_intrinsic_vild_nv: diff --git a/src/compiler/nir/nir_intrinsics.py b/src/compiler/nir/nir_intrinsics.py index 33b90a5dfbc..afbce7a24a2 100644 --- a/src/compiler/nir/nir_intrinsics.py +++ b/src/compiler/nir/nir_intrinsics.py @@ -2686,9 +2686,6 @@ system_value("simd_width_intel", 1) # IndirectDataStartAddress system_value("indirect_address_intel", 1) -# The dispatch mask as provided in the FS payload. -system_value("dispatch_mask_intel", 1) - # The raw coverage mask as provided in the FS payload. # The semantics of it depend on the HW state. system_value("coverage_mask_intel", 1) @@ -2704,8 +2701,8 @@ intrinsic("load_reloc_const_intel", dest_comp=1, bit_sizes=[32], indices=[PARAM_IDX, BASE], flags=[CAN_ELIMINATE, CAN_REORDER]) # Write a render target -# src[] = { color, dual_color, src0_alpha, omask, depth, stencil, predicate } -intrinsic("store_render_target_intel", [4, 4, 1, 1, 1, 1, 1], indices=[TARGET], bit_sizes=[32, 32, 32, 32, 32, 32, 1]) +# src[] = { color, dual_color, src0_alpha, omask, depth, stencil } +intrinsic("store_render_target_intel", [4, 4, 1, 1, 1, 1], indices=[TARGET], bit_sizes=[32, 32, 32, 32, 32, 32]) # Shuffle with an offset in bytes instead of a lane index. # src[] = { payload, lane offset in bytes } diff --git a/src/intel/compiler/jay/jay_assign_flags.c b/src/intel/compiler/jay/jay_assign_flags.c index 35b20d310f8..67180ea3af1 100644 --- a/src/intel/compiler/jay/jay_assign_flags.c +++ b/src/intel/compiler/jay/jay_assign_flags.c @@ -72,10 +72,15 @@ assign_flag(struct flag_ra *ra, jay_def tmp = jay_alloc_def(ra->b, file, 1); unsigned num_flags = jay_num_regs(ra->b->shader, FLAG); + if (ra->b->shader->helpers_tracked) { + /* Helper tracking uses the last flag by definition */ + num_flags--; + } + tmp.reg = tie ? tie->reg : ballot ? 0 : ((ra->roundrobin++) % num_flags); /* Uniform access (via a UFLAG or an inverse-ballot) would clobber the zero - * for a ballot. 
We could refine this further but this should be ok for now. + * for a ballot. TODO: This needs to be reworked to get the flag back. */ if (!ballot && tmp.reg == 0 && @@ -84,6 +89,8 @@ assign_flag(struct flag_ra *ra, assert(!tie); tmp.reg = 1; ra->roundrobin++; + + assert(num_flags >= 2); /* XXX: Not always true, FIXME */ } if (jay_index(canonical) < ra->nr_vars) { @@ -193,6 +200,16 @@ assign_block(struct flag_ra *ra) I->type = JAY_TYPE_U32; I->dst = canonicalize_flag(I->dst); continue; + } else if (I->op == JAY_OPCODE_SEND && + jay_send_skip_helpers(I) && + jay_is_no_mask(I)) { + + /* jay_lower_helpers will clobber flag 0 to handle this case, see the + * logic there. Evict whatever was there. + */ + ra->flag_to_global[0] = 0; + assert(!I->predication); + continue; } else if (I->type == JAY_TYPE_U1) { /* Boolean logic turns into bitwise logic on the canonical form */ if (!jay_is_null(I->dst)) { diff --git a/src/intel/compiler/jay/jay_from_nir.c b/src/intel/compiler/jay/jay_from_nir.c index 67a61f54bad..cb3dbe1fae1 100644 --- a/src/intel/compiler/jay/jay_from_nir.c +++ b/src/intel/compiler/jay/jay_from_nir.c @@ -8,12 +8,14 @@ #include "compiler/brw/brw_eu_defines.h" #include "compiler/brw/brw_nir.h" #include "compiler/brw/brw_sampler.h" +#include "compiler/gen/gen_enums.h" #include "compiler/intel_nir.h" #include "compiler/intel_shader_enums.h" #include "compiler/list.h" #include "intel/dev/intel_debug.h" #include "mda/debug_archiver.h" #include "util/bitscan.h" +#include "util/bitset.h" #include "util/lut.h" #include "util/macros.h" #include "util/u_math.h" @@ -81,10 +83,7 @@ struct nir_to_jay_state { const struct intel_device_info *devinfo; jay_builder bld; - - jay_block *current_block; - jay_block *after_block; - jay_block *break_block; + jay_block *current_block, *after_block, *break_block, *exit_block; unsigned indent; bool needs_final_halt; @@ -832,19 +831,6 @@ scalars_equal(nir_scalar a, nir_scalar b) nir_scalar_as_uint(a) == nir_scalar_as_uint(b)); } -static 
void -jay_emit_halt_target(struct nir_to_jay_state *nj) -{ - /* This final halt will re-enable the channels which got masked off by first - * HALT. - */ - if (nj->needs_final_halt) { - /* This avoids re-emitting the halt after EOT send */ - nj->needs_final_halt = false; - jay_HALT_TARGET(&nj->bld); - } -} - static void jay_emit_fb_write(struct nir_to_jay_state *nj, nir_intrinsic_instr *intr) { @@ -860,8 +846,6 @@ jay_emit_fb_write(struct nir_to_jay_state *nj, nir_intrinsic_instr *intr) const int target = MAX2(((signed) nir_intrinsic_target(intr)), 0); const bool last = !nir_instr_next(&intr->instr); - jay_emit_halt_target(nj); - /* The hardware freaks out if we give it an omask without multisampling. */ if (!b->shader->prog_data->fs.uses_omask) { omask = jay_null(); @@ -941,15 +925,10 @@ jay_emit_fb_write(struct nir_to_jay_state *nj, nir_intrinsic_instr *intr) srcs[len++] = jay_extract(packed, i); } - jay_inst *send = - jay_SEND(b, .sfid = GEN_SFID_RENDER_CACHE, .check_tdr = true, - .msg_desc = desc | (ex_desc << 32), .srcs = srcs, .nr_srcs = len, - .type = JAY_TYPE_U32, .eot = last, .split = split); - - /* Handle the disable predicate. It is logically inverted. 
*/ - if (!nir_src_is_zero(intr->src[6])) { - jay_add_predicate(b, send, jay_negate(nj_src(intr->src[6]))); - } + jay_SEND(b, .sfid = GEN_SFID_RENDER_CACHE, .check_tdr = true, + .msg_desc = desc | (ex_desc << 32), .srcs = srcs, .nr_srcs = len, + .type = JAY_TYPE_U32, .eot = last, .split = split, + .skip_helpers = true); } static enum lsc_data_size @@ -1572,19 +1551,6 @@ jay_emit_intrinsic(struct nir_to_jay_state *nj, nir_intrinsic_instr *intr) jay_MOV(b, dst, fs->coverage_mask); break; - case nir_intrinsic_load_dispatch_mask_intel: { - jay_def mask = jay_extract(nj->payload.u0, 15); - - if (nj->s->dispatch_width == 32) { - /* TODO: Optimize */ - jay_def hi = jay_extract(nj->payload.u1, 15); - mask = jay_BFI2_u32(b, 0xffff0000, hi, mask); - } - - jay_MOV(b, dst, mask); - break; - } - case nir_intrinsic_load_subgroup_invocation: { jay_def lid = jay_alloc_def(b, UGPR, s->dispatch_width / 2); jay_LANE_ID_8(b, jay_extract_range(lid, 0, 4)); @@ -1600,8 +1566,16 @@ jay_emit_intrinsic(struct nir_to_jay_state *nj, nir_intrinsic_instr *intr) } case nir_intrinsic_demote: + jay_DEMOTE_u32(b, jay_null(), jay_null()); + break; case nir_intrinsic_demote_if: - /* TODO: Already lowered, but need to implement for performance. 
*/ + jay_DEMOTE(b, JAY_TYPE_U1, nj_src(intr->src[0]), 0)->conditional_mod = + GEN_CONDITION_NE; + break; + + case nir_intrinsic_load_helper_invocation: + case nir_intrinsic_is_helper_invocation: + jay_HELPER_SEL(b, dst, 1, 0); break; case nir_intrinsic_ddx: @@ -2455,7 +2429,8 @@ jay_emit_texture(struct nir_to_jay_state *nj, nir_tex_instr *tex) .ex_desc = desc_ex_src, .header = header, .srcs = payload, .nr_srcs = n_sources, .type = JAY_TYPE_U32, .src_type = { src_type }, .dst = tmp, .uniform = payload_uniform, - .bindless = surface_bindless, .pure = true); + .bindless = surface_bindless, .pure = true, + .skip_helpers = tex->skip_helpers); /* If we sampled into a temporary, copy out to the final */ if (residency) { @@ -2484,7 +2459,8 @@ jay_emit_jump(struct nir_to_jay_state *nj, nir_jump_instr *instr) break; case nir_jump_halt: nj->needs_final_halt = true; - jay_HALT(&nj->bld); + jay_block_add_successor(nj->current_block, nj->exit_block, GPR); + jay_HALT(&nj->bld, false); break; case nir_jump_return: /* Should be lowered */ @@ -2754,8 +2730,16 @@ static void jay_emit_eot(struct nir_to_jay_state *nj) { jay_builder *b = &nj->bld; + b->cursor = jay_after_block(nj->exit_block); - jay_emit_halt_target(nj); + /* Jump target for HALT */ + if (nj->needs_final_halt) { + if (nj->s->stage == MESA_SHADER_FRAGMENT) { + assert(nj->s->helpers_tracked); + } else { + jay_HALT_TARGET(&nj->bld); + } + } if (mesa_shader_stage_is_compute(nj->nir->info.stage)) { jay_def u0 = nj->payload.u0; @@ -2773,12 +2757,18 @@ jay_emit_eot(struct nir_to_jay_state *nj) .uniform = true); } else if (nj->nir->info.stage == MESA_SHADER_VERTEX || nj->nir->info.stage == MESA_SHADER_TESS_EVAL) { - jay_block *block = jay_last_block(nj->f); + jay_block *block = jay_last_source_block(nj->f); jay_inst *I = jay_last_inst(block); + assert(!nj->needs_final_halt && "halt not supported with URB"); + /* TODO: What if this isn't the case? Do we need a no-op store...? 
*/ assert(I && I->op == JAY_OPCODE_SEND && jay_send_sfid(I) == GEN_SFID_URB); + + /* Pluck out the final SEND and put it in the exit block */ jay_set_send_eot(I, true); + jay_remove_instruction(I); + jay_builder_insert(b, I); } } @@ -3012,6 +3002,14 @@ setup_fragment_payload(struct nir_to_jay_state *nj, struct payload_builder *p) } } + /* INIT_HELPERS reads UGPRs but has no SSA write. Therefore to minimize + * pressure, we want to hoist it as much as possible. + */ + if (nj->s->helpers_tracked) { + jay_INIT_HELPERS(&nj->bld, jay_extract(nj->payload.u0, 15), + payload_u1(nj, 15, 1)); + } + for (unsigned i = 0; i < ARRAY_SIZE(split_gprs); ++i) { if (!jay_is_null(split[i]) && split_gprs[i].def->file == UGPR) { *(split_gprs[i].def) = @@ -3178,7 +3176,11 @@ jay_from_nir_function(const struct intel_device_info *devinfo, jay_setup_payload(&nj); } + nj.exit_block = jay_create_block(&nj); jay_emit_cf_list(&nj, &impl->body); + jay_block_add_successor(nj.current_block, nj.exit_block, GPR); + + list_addtail(&nj.exit_block->link, &f->blocks); jay_emit_eot(&nj); jay_remove_unreachable_blocks(f); } @@ -3216,8 +3218,9 @@ jay_compile(const struct intel_device_info *devinfo, INTEL_DEBUG(intel_debug_flag_for_shader_stage(nir->info.stage)) && !(nir->info.internal || NIR_DEBUG(PRINT_INTERNAL)); + bool track_helpers = false; unsigned simd_width = - jay_process_nir(devinfo, nir, prog_data, key, archiver); + jay_process_nir(devinfo, nir, prog_data, key, archiver, &track_helpers); if (debug) { /* We can't use nir_print_shader since it reindexes SSA defs. 
*/ @@ -3232,6 +3235,7 @@ jay_compile(const struct intel_device_info *devinfo, s->devinfo = devinfo; s->prog_data = prog_data; s->archiver = archiver; + s->helpers_tracked = track_helpers; nir_foreach_function_impl(impl, nir) { jay_from_nir_function(devinfo, nir, s, impl); @@ -3288,6 +3292,10 @@ jay_compile(const struct intel_device_info *devinfo, JAY_PASS(s, jay_insert_payload_swizzle); } + if (s->stage == MESA_SHADER_FRAGMENT && s->helpers_tracked) { + JAY_PASS(s, jay_lower_helpers); + } + if (!(jay_debug & JAY_DBG_NOOPT)) { /* jay_assign_accumulators uses a conservative liveness analysis for * predication, so assign accumulators before predicating for better diff --git a/src/intel/compiler/jay/jay_ir.h b/src/intel/compiler/jay/jay_ir.h index 624fcb20358..f939f7bd771 100644 --- a/src/intel/compiler/jay/jay_ir.h +++ b/src/intel/compiler/jay/jay_ir.h @@ -787,6 +787,12 @@ typedef struct jay_shader { unsigned scratch_size; unsigned payload_gprs, payload_ugprs, push_grfs; + /** + * In a fragment shader, whether a helper invocation flag is tracked. Flag RA + * must reserve the relevant flag. + */ + bool helpers_tracked; + /** * Ralloc linear context. Since we don't typically free as we go, * most allocations should go through this context for efficiency. 
@@ -1126,7 +1132,7 @@ jay_new_block(jay_function *f) static inline bool jay_op_is_control_flow(enum jay_opcode op) { - return op >= JAY_OPCODE_BRD && op <= JAY_OPCODE_LOOP_ONCE; + return op >= JAY_OPCODE_BRD && op <= JAY_OPCODE_HALT; } /** @@ -1201,6 +1207,9 @@ jay_first_predecessor(jay_block *block, enum jay_file file) #define jay_foreach_block_rev(f, v) \ list_for_each_entry_rev(jay_block, v, &f->blocks, link) +#define jay_foreach_block_safe_rev(f, v) \ + list_for_each_entry_safe_rev(jay_block, v, &f->blocks, link) + #define jay_foreach_block_from(f, from, v) \ list_for_each_entry_from(jay_block, v, from, &f->blocks, link) @@ -1238,7 +1247,7 @@ jay_first_predecessor(jay_block *block, enum jay_file file) jay_foreach_inst_in_block_safe(block, v) #define jay_foreach_inst_in_func_safe_rev(func, block, v) \ - jay_foreach_block_rev(func, block) \ + jay_foreach_block_safe_rev(func, block) \ jay_foreach_inst_in_block_safe_rev(block, v) #define jay_foreach_inst_in_shader(s, func, inst) \ @@ -1355,6 +1364,15 @@ jay_last_block(jay_function *f) return list_last_entry(&f->blocks, jay_block, link); } +static inline jay_block * +jay_last_source_block(jay_function *f) +{ + if (list_is_empty(&f->blocks) || list_is_singular(&f->blocks)) + return NULL; + else + return list_last_entry(&jay_last_block(f)->link, jay_block, link); +} + static inline jay_inst * jay_last_inst(jay_block *block) { @@ -1373,11 +1391,14 @@ jay_next_block(jay_block *block) static inline void jay_block_add_successor(jay_block *block, jay_block *succ, enum jay_file file) { + /* Prune duplicate successors (NULL only matches an empty slot, so exclude it) */ jay_block **succs = jay_successors(block, file); - unsigned i = succs[0] ? 1 : 0; + if (succ && (succs[0] == succ || succs[1] == succ)) { + return; + } - assert(succ && succs[0] != succ && succs[1] != succ); - assert(succs[i] == NULL && "at most 2 successors"); + unsigned i = succs[0] ? 
1 : 0; + assert(succ && succs[i] == NULL && "at most 2 successors"); succs[i] = succ; util_dynarray_append(jay_predecessors(succ, file), block); diff --git a/src/intel/compiler/jay/jay_lower_helpers.c b/src/intel/compiler/jay/jay_lower_helpers.c new file mode 100644 index 00000000000..c8423dcc1aa --- /dev/null +++ b/src/intel/compiler/jay/jay_lower_helpers.c @@ -0,0 +1,191 @@ +/* + * Copyright 2026 Intel Corporation + * SPDX-License-Identifier: MIT + */ +#include "compiler/gen/gen_enums.h" +#include "util/list.h" +#include "util/u_dynarray.h" +#include "jay_builder.h" +#include "jay_builder_opcodes.h" +#include "jay_ir.h" +#include "jay_opcodes.h" +#include "jay_private.h" + +struct ctx { + jay_block *last_source_block; + jay_def helper_flag; + bool halted, uses_terminate; + unsigned instr_left; +}; + +/* + * Takes src, a linked list containing the element pivot in the middle, and dst + * an empty list. Moves all elements up to and including pivot from src to dst, + * leaving the rest in src. Semantically equivalent to a loop of list_move but + * O(1) time regardless of the position of pivot in the list. 
+ */ +static void +list_partition(struct list_head *src, + struct list_head *dst, + struct list_head *pivot) +{ + /* dst runs from src[0] to pivot */ + dst->next = src->next; + dst->prev = pivot; + dst->next->prev = dst; + + /* src runs from pivot[1:] to end of src */ + src->next = pivot->next; + src->prev = src->prev; /* no-op: src keeps its tail (repaired just below if pivot was the tail) */ + + src->next->prev = src; /* when pivot was the tail, src->next is src itself, leaving src empty */ + pivot->next = dst; /* pivot is now the last element of dst */ + + list_validate(dst); + list_validate(src); +} + +static void +process_block(struct ctx *ctx, jay_builder *b, jay_block *block) +{ + jay_foreach_inst_in_block_safe_rev(block, I) { + b->cursor = jay_before_inst(I); + + if (I->op == JAY_OPCODE_INIT_HELPERS) { + jay_NOT(b, ctx->helper_flag, I->src[0])->type = JAY_TYPE_U16; + + if (!jay_is_null(I->src[1])) { + jay_def hi = ctx->helper_flag; + hi.hi = true; + jay_NOT(b, hi, I->src[1])->type = JAY_TYPE_U16; + } + + jay_remove_instruction(I); + } else if (I->op == JAY_OPCODE_HALT) { + ctx->halted = ctx->uses_terminate = true; /* halted makes jay_lower_helpers emit HALT_TARGET in the exit block */ + } else if (I->op == JAY_OPCODE_DEMOTE) { + enum gen_condition cond = I->conditional_mod; + jay_def x = I->src[0], y = I->src[1]; + + /* Unconditional discard */ + if (!cond) { + cond = GEN_CONDITION_EQ; + I->type = JAY_TYPE_U32; + x = y = jay_bare_reg(UGPR, 0); /* same register on both sides, so the EQ compare always holds */ + } + + jay_inst *cmp = jay_CMP(b, I->type, cond, ctx->helper_flag, x, y); + jay_add_predicate(b, cmp, jay_negate(ctx->helper_flag)); + jay_remove_instruction(I); + + /* We are allowed to halt after a demote if all lanes are inactive + * for performance, but it's not required for correctness. Only do + * it if it's likely profitable. + * + * We assume a shader either uses SPIR-V demote or terminate, but + * not both. If the shader uses terminate, there will be an actual + * HALT instruction after us so we don't bother with a second HALT + * here. Strictly there's a corner case here if all non-helpers are + * terminated but lanes spawned as helpers are not terminated, but + * this is probably reasonable as a tradeoff. 
+ */ + if (ctx->instr_left > 6 && !ctx->uses_terminate) { + jay_inst *halt = jay_HALT(b, true); + halt = jay_add_predicate(b, halt, ctx->helper_flag); + ctx->halted = true; + + jay_block *split = jay_new_block(b->func); + split->indent = block->indent; + + list_partition(&block->instructions, &split->instructions, + &halt->link); + list_addtail(&split->link, &block->link); + + /* The split block either falls through or jumps to the exit */ + for (unsigned file = GPR; file <= UGPR; ++file) { + jay_foreach_predecessor(block, pred, file) { + jay_block **succs = jay_successors(*pred, file); + unsigned idx = succs[0] == block ? 0 : 1; + succs[idx] = split; + } + } + typed_memcpy(&split->physical_preds, &block->physical_preds, 1); + typed_memcpy(&split->logical_preds, &block->logical_preds, 1); + util_dynarray_init(&block->physical_preds, block); + util_dynarray_init(&block->logical_preds, block); + + jay_block_add_successor(split, block, GPR); + jay_block_add_successor(split, jay_last_block(b->func), GPR); + return; + } + } else if (I->op == JAY_OPCODE_HELPER_SEL) { + jay_SEL(b, JAY_TYPE_U32, I->dst, I->src[0], I->src[1], + ctx->helper_flag); + jay_remove_instruction(I); + } else if (I->op == JAY_OPCODE_SEND && jay_send_skip_helpers(I)) { + if (jay_is_no_mask(I)) { + /* jay_assign_flags ensured this is free for us, see logic there */ + jay_def t = jay_bare_reg(UFLAG, 0); + jay_inst *not = jay_NOT(b, jay_null(), ctx->helper_flag); + not->type = JAY_TYPE_U | b->shader->dispatch_width; + jay_set_conditional_mod(b, not, t, GEN_CONDITION_NE); + jay_add_predicate(b, I, t); + } else { + jay_add_predicate(b, I, jay_negate(ctx->helper_flag)); + } + } + + ++ctx->instr_left; + } +} + +void +jay_lower_helpers(jay_shader *shader) +{ + jay_function *entrypoint = jay_shader_get_entrypoint(shader); + jay_block *exit_block = jay_last_block(entrypoint); + jay_block *last_source_block = jay_last_source_block(entrypoint); + + /* By ABI with jay_assign_flags, the last flag is used to track 
helpers */ + assert(shader->helpers_tracked); + unsigned helper_flag_no = jay_num_regs(shader, FLAG) - 1; + struct ctx ctx = { .helper_flag = jay_bare_reg(FLAG, helper_flag_no) }; + jay_builder b = jay_init_builder(entrypoint, jay_after_block(exit_block)); + + jay_foreach_block_rev(entrypoint, block) { + process_block(&ctx, &b, block); + } + + /* Fill out the exit block */ + b.cursor = jay_after_block(exit_block); + if (ctx.halted) { + jay_HALT_TARGET(&b); + } + + /* Try to pluck out the last instruction and use it for EOT. This breaks SSA + * dominance invariants but that's why this is a post-RA, post-sched pass. + * Only SWSB has to deal with the resulting mess. + * + * There may be no such send (in case of an unconditional terminate). In that + * case, insert a predicated-out null RT write to use for EOT. + */ + jay_inst *send = jay_last_inst(last_source_block); + if (send && send->op == JAY_OPCODE_SEND && jay_send_eot(send)) { + jay_remove_instruction(send); + jay_builder_insert(&b, send); + } else { + jay_def dummy = jay_bare_reg(GPR, 0); + dummy.num_values_m1 = 4 - 1; + + unsigned op = shader->dispatch_width == 32 ? 
+ XE2_DATAPORT_RENDER_TARGET_WRITE_SIMD32_SINGLE_SOURCE : + BRW_DATAPORT_RENDER_TARGET_WRITE_SIMD16_SINGLE_SOURCE; + uint64_t desc = brw_fb_write_desc(shader->devinfo, 0, op, true, false); + uint64_t ex_desc = (1 << 20) /* null rt */; + + send = jay_SEND(&b, .sfid = GEN_SFID_RENDER_CACHE, .check_tdr = true, + .msg_desc = desc | (ex_desc << 32), .nr_srcs = 1, + .srcs = &dummy, .type = JAY_TYPE_U32, .eot = true); + send = jay_add_predicate(&b, send, jay_negate(ctx.helper_flag)); + } +} diff --git a/src/intel/compiler/jay/jay_lower_pre_ra.c b/src/intel/compiler/jay/jay_lower_pre_ra.c index dae14383201..f2771a85305 100644 --- a/src/intel/compiler/jay/jay_lower_pre_ra.c +++ b/src/intel/compiler/jay/jay_lower_pre_ra.c @@ -96,7 +96,7 @@ try_swap_src01(jay_inst *I) if (I->op == JAY_OPCODE_SEL) { /* sel(a, b, p) = sel(b, a, !p) */ I->src[2].negate ^= true; - } else if (I->op == JAY_OPCODE_CMP) { + } else if (I->op == JAY_OPCODE_CMP || I->op == JAY_OPCODE_DEMOTE) { I->conditional_mod = gen_condition_swap_sources(I->conditional_mod); } else if (I->op == JAY_OPCODE_BFN) { jay_set_bfn_ctrl(I, util_lut3_swap_sources(jay_bfn_ctrl(I), 0, 1)); diff --git a/src/intel/compiler/jay/jay_nir.c b/src/intel/compiler/jay/jay_nir.c index 1e8753bce78..3c23cd86849 100644 --- a/src/intel/compiler/jay/jay_nir.c +++ b/src/intel/compiler/jay/jay_nir.c @@ -46,21 +46,6 @@ nj_index_ssa_defs(nir_shader *nir) } } -static bool -lower_helper_invocation(nir_builder *b, nir_intrinsic_instr *intr, void *_) -{ - if (intr->intrinsic != nir_intrinsic_load_helper_invocation) - return false; - - /* TODO: Is this right for multisampling? 
*/ - b->cursor = nir_before_instr(&intr->instr); - nir_def *active = - nir_inot(b, nir_inverse_ballot(b, nir_load_dispatch_mask_intel(b))); - - nir_def_replace(&intr->def, active); - return true; -} - static bool lower_frag_coord(nir_builder *b, nir_intrinsic_instr *intr, void *simd_) { @@ -178,8 +163,7 @@ insert_rt_store(nir_builder *b, nir_def *src0_colour, nir_def *depth, nir_def *stencil, - nir_def *sample_mask, - nir_def *disable) + nir_def *sample_mask) { bool null_rt = target < 0; @@ -197,8 +181,7 @@ insert_rt_store(nir_builder *b, nir_def *src0_alpha = nir_channel_or_undef(b, src0_colour ?: colour, 3); nir_store_render_target_intel(b, colour, dual_colour, src0_alpha, - sample_mask, depth, stencil, disable, - .target = target); + sample_mask, depth, stencil, .target = target); } static void @@ -216,14 +199,10 @@ lower_fragment_outputs(nir_function_impl *impl, nir_def *undef = nir_undef(b, 1, 32); - nir_def *disable = b->shader->info.fs.uses_discard ? - nir_is_helper_invocation(b, 1) : - nir_imm_false(b); - if (ctx.dual_blend) { insert_rt_store(b, 0, ctx.colour[0], ctx.colour[1], NULL, ctx.depth ?: undef, ctx.stencil ?: undef, - ctx.sample_mask ?: undef, disable); + ctx.sample_mask ?: undef); return; } @@ -239,83 +218,13 @@ lower_fragment_outputs(nir_function_impl *impl, if (ctx.colour[i]) { insert_rt_store(b, i, ctx.colour[i], NULL, i > 0 ? ctx.colour[0] : NULL, ctx.depth ?: undef, - ctx.stencil ?: undef, ctx.sample_mask ?: undef, - disable); + ctx.stencil ?: undef, ctx.sample_mask ?: undef); } } insert_rt_store(b, last, last >= 0 ? ctx.colour[last] : NULL, NULL, last > 0 ? ctx.colour[0] : NULL, ctx.depth ?: undef, - ctx.stencil ?: undef, ctx.sample_mask ?: undef, disable); -} - -/** - * Drop render target stores with unconditional discards. 
- */ -static bool -opt_unconditional_discards(nir_shader *nir) -{ - nir_function_impl *impl = nir_shader_get_entrypoint(nir); - nir_block *block = nir_impl_last_block(impl); - - bool progress = false; - bool any_remaining_rt_writes = false; - - nir_foreach_instr_reverse_safe(instr, block) { - if (instr->type != nir_instr_type_intrinsic) - continue; - - nir_intrinsic_instr *intr = nir_instr_as_intrinsic(instr); - - if (intr->intrinsic == nir_intrinsic_store_render_target_intel) { - nir_scalar discard = nir_scalar_resolved(intr->src[6].ssa, 0); - if (nir_scalar_is_const(discard) && nir_scalar_as_bool(discard)) { - /* Drop store with unconditional discard */ - nir_instr_remove(instr); - progress = true; - } else { - /* This RT store might actually happen */ - any_remaining_rt_writes = true; - } - } else if ((intr->intrinsic == nir_intrinsic_demote || - intr->intrinsic == nir_intrinsic_terminate) && - !any_remaining_rt_writes) { - /* Delete unconditional demotes/terminates in the end block... */ - nir_instr_remove(instr); - progress = true; - } else { - /* ...but stop if we find an intrinsic that has a side-effect */ - const nir_intrinsic_info *info = &nir_intrinsic_infos[intr->intrinsic]; - if (!(info->flags & NIR_INTRINSIC_CAN_ELIMINATE)) - break; - } - } - - /* See if discards still exist in the program and flag accordingly */ - nir->info.fs.uses_discard = false; - - nir_foreach_block(block, impl) { - nir_foreach_instr(instr, block) { - if (instr->type == nir_instr_type_intrinsic) { - nir_intrinsic_instr *intr = nir_instr_as_intrinsic(instr); - if (intr->intrinsic == nir_intrinsic_demote || - intr->intrinsic == nir_intrinsic_demote_if || - intr->intrinsic == nir_intrinsic_terminate || - intr->intrinsic == nir_intrinsic_terminate_if) - nir->info.fs.uses_discard = true; - } - } - } - - /* If we eliminated all RT stores, add a Null RT store to end the thread. 
*/ - if (!any_remaining_rt_writes) { - nir_builder b = nir_builder_at(nir_after_impl(impl)); - nir_def *undef = nir_undef(&b, 1, 32); - insert_rt_store(&b, -1, NULL, NULL, NULL, undef, undef, undef, - nir_imm_true(&b)); - } - - return nir_progress(progress, impl, nir_metadata_control_flow); + ctx.stencil ?: undef, ctx.sample_mask ?: undef); } unsigned @@ -323,7 +232,8 @@ jay_process_nir(const struct intel_device_info *devinfo, nir_shader *nir, union brw_any_prog_data *prog_data, union brw_any_prog_key *key, - debug_archiver *archiver) + debug_archiver *archiver, + bool *track_helpers) { enum mesa_shader_stage stage = nir->info.stage; struct brw_compiler compiler = { .devinfo = devinfo }; @@ -475,10 +385,14 @@ jay_process_nir(const struct intel_device_info *devinfo, lower_fragment_outputs(nir_shader_get_entrypoint(nir), devinfo, key->fs.nr_color_regions, simd_width); - JAY_NIR_PASS(nir_lower_helper_writes, true); - JAY_NIR_PASS(nir_lower_is_helper_invocation); - JAY_NIR_PASS(nir_shader_intrinsics_pass, lower_helper_invocation, - nir_metadata_control_flow, NULL); + + /* nir_lower_terminate_to_demote will hamper our ability to schedule + * terminates (since it turns them into real control flow), so run + * nir_opt_move_discards_to_top first as a prepass. That should help + * scheduling demotes too (which is more important). 
+ */ + JAY_NIR_PASS(nir_opt_move_discards_to_top); + JAY_NIR_PASS(nir_lower_terminate_to_demote); if (key->fs.alpha_to_coverage != INTEL_NEVER) { /* Run constant fold optimization in order to get the correct source @@ -495,8 +409,6 @@ jay_process_nir(const struct intel_device_info *devinfo, */ brw_nir_optimize(pt); - NIR_PASS(_, nir, opt_unconditional_discards); - // TODO // JAY_NIR_PASS(brw_nir_move_interpolation_to_top); @@ -556,10 +468,31 @@ jay_process_nir(const struct intel_device_info *devinfo, /* Run divergence analysis at the end */ nir_sweep(nir); - nj_index_ssa_defs(nir); nir_divergence_analysis(nir); - if (stage != MESA_SHADER_FRAGMENT) + if (stage == MESA_SHADER_FRAGMENT) { + /* Certain features require tracking helpers for correctness */ + nir_shader_gather_info(nir, nir_shader_get_entrypoint(nir)); + *track_helpers |= nir->info.fs.uses_discard || nir->info.writes_memory; + *track_helpers |= BITSET_TEST(nir->info.system_values_read, + SYSTEM_VALUE_HELPER_INVOCATION); + + /* ...but this is more subtle. nir_opt_load_skip_helpers flags texturing + * operations that we can skip for bandwidth savings. We need divergence + * info for this, so we run late. + * + * We may or may not want to force track_helpers on if this makes + * progress. Possibly driconf'ing on furmark makes sense. + */ + struct nir_opt_load_skip_helpers_options skip_helpers = { + .no_add_divergence = true + }; + JAY_NIR_PASS(nir_opt_load_skip_helpers, &skip_helpers); + } else { jay_populate_prog_data(devinfo, nir, prog_data, key, nr_packed_regs); + } + + /* This must be the very last pass since nir_print itself will reindex! 
*/ + nj_index_ssa_defs(nir); return simd_width; } diff --git a/src/intel/compiler/jay/jay_opcodes.py b/src/intel/compiler/jay/jay_opcodes.py index 2d19591fe74..c09adbe9b31 100644 --- a/src/intel/compiler/jay/jay_opcodes.py +++ b/src/intel/compiler/jay/jay_opcodes.py @@ -81,7 +81,6 @@ op('bfrev', 1, 'u32', Props.NEGATE) op('cbit', 1, 'u32') op('cmp', 2, 'u32', Props.NEGATE | Props.CMOD) - # With an 8/16-bit type, `index` specifies the element index of the source # within the 32-bit word. For example, if src_type == U16 and index == 1, this # converts the upper 16-bits of the input. @@ -134,9 +133,11 @@ op('schedule_barrier', 0, None, Props.NO_DEST) for n in ['brd', 'illegal', 'goto', 'join', 'if', 'else', 'endif', 'while', 'break', 'cont', 'call', 'calla', 'jmpi', 'ret', - 'loop_once', 'halt', 'halt_target']: + 'loop_once', 'halt_target']: op(n, 0, None, Props.NO_DEST) +op('halt', 0, None, Props.NO_DEST, ['bool predicate_all']) + op('send', 4, None, Props.SIDE_EFFECTS, [ 'gen_sfid sfid', 'uint8_t sbid', @@ -234,6 +235,15 @@ op('dpas', 3, 'u32', 0, [ 'uint8_t pad[3]', ]) +# Initialize helper invocations. Takes 16-bit halves of the dispatch mask. +op('init_helpers', 2, 'u16', Props.NO_DEST) + +# Compare the arguments and demote based on the result. 
+op('demote', 2, 'u1 u16 u32 u64 s16 s32 s64 f16 f32 f64', Props.NEGATE | Props.NO_DEST) + +# Equivalent to NIR bcsel(@is_helper_invocation, source 0, source 1) +op('helper_sel', 2, 'u1 u32') + OPCODES = _opcodes ENUMS: 'Mapping[str, tuple[str, list[str]]]' = { diff --git a/src/intel/compiler/jay/jay_opt_propagate.c b/src/intel/compiler/jay/jay_opt_propagate.c index 0172c15ad21..4aeaf2b6120 100644 --- a/src/intel/compiler/jay/jay_opt_propagate.c +++ b/src/intel/compiler/jay/jay_opt_propagate.c @@ -3,6 +3,7 @@ * SPDX-License-Identifier: MIT */ +#include "compiler/gen/gen_enums.h" #include "util/bitset.h" #include "util/lut.h" #include "jay_builder.h" @@ -121,6 +122,30 @@ propagate_not(jay_inst *I, unsigned s, jay_inst *mod) } } +/** + * Fuse demote(cmp(x, y) != 0) to demote(x CMP y). + */ +static void +fuse_demote(jay_inst *demote, jay_inst **defs) +{ + if (!(jay_is_ssa(demote->src[0]) && + jay_is_zero(demote->src[1]) && + demote->type == JAY_TYPE_U1 && + demote->conditional_mod == GEN_CONDITION_NE)) { + return; + } + + jay_inst *cmp = defs[jay_index(demote->src[0])]; + if (cmp->op != JAY_OPCODE_CMP || cmp->predication) { + return; + } + + demote->conditional_mod = cmp->conditional_mod; + demote->src[0] = cmp->src[0]; + demote->src[1] = cmp->src[1]; + demote->type = cmp->type; +} + static void propagate_forwards(jay_function *f) { @@ -156,6 +181,11 @@ propagate_forwards(jay_function *f) if (I->op == JAY_OPCODE_PHI_SRC || I->op == JAY_OPCODE_SEND) continue; + /* We fuse demote forwards & upfront to avoid fighting cmod prop */ + if (I->op == JAY_OPCODE_DEMOTE) { + fuse_demote(I, defs); + } + jay_foreach_ssa_src(I, s) { /* Copy propagate whole vectors */ jay_def src = I->src[s]; diff --git a/src/intel/compiler/jay/jay_private.h b/src/intel/compiler/jay/jay_private.h index c94c7dac7ab..f2288552f60 100644 --- a/src/intel/compiler/jay/jay_private.h +++ b/src/intel/compiler/jay/jay_private.h @@ -34,7 +34,8 @@ unsigned jay_process_nir(const struct intel_device_info 
*devinfo, nir_shader *nir, union brw_any_prog_data *prog_data, union brw_any_prog_key *key, - debug_archiver *archiver); + debug_archiver *archiver, + bool *track_helpers); void jay_compute_liveness(jay_function *f); void jay_calculate_register_demands(jay_function *f); @@ -84,6 +85,7 @@ void jay_schedule_pressure(jay_shader *s); void jay_lower_pre_ra(jay_shader *s); void jay_lower_post_ra(jay_shader *s); +void jay_lower_helpers(jay_shader *s); void jay_lower_spill(jay_function *func); void jay_lower_simd_width(jay_shader *s); void jay_lower_scoreboard(jay_shader *s); diff --git a/src/intel/compiler/jay/jay_schedule.c b/src/intel/compiler/jay/jay_schedule.c index f5a642e3550..e206ac3ae0c 100644 --- a/src/intel/compiler/jay/jay_schedule.c +++ b/src/intel/compiler/jay/jay_schedule.c @@ -100,9 +100,17 @@ populate_dag(struct sched_ctx *ctx, address = ctx->dag.node; } - /* Serialize side effects for now */ + /* Serialize side effects for now, including SENDs which need to be + * predicated away after a demote. 
+ */ if ((I->op == JAY_OPCODE_SEND && !jay_send_pure(I)) || - I->op == JAY_OPCODE_SCHEDULE_BARRIER) { + I->op == JAY_OPCODE_SCHEDULE_BARRIER || + I->op == JAY_OPCODE_INIT_HELPERS || + I->op == JAY_OPCODE_DEMOTE || + I->op == JAY_OPCODE_HELPER_SEL || + (I->op == JAY_OPCODE_SEND && + func->shader->helpers_tracked && + jay_send_skip_helpers(I))) { jay_dag_add_edge(&ctx->dag, sidefx); sidefx = ctx->dag.node; diff --git a/src/intel/compiler/jay/jay_simd_width.c b/src/intel/compiler/jay/jay_simd_width.c index 8987a50af87..f07ec23b908 100644 --- a/src/intel/compiler/jay/jay_simd_width.c +++ b/src/intel/compiler/jay/jay_simd_width.c @@ -22,6 +22,7 @@ max_simd_width(const jay_shader *shader, const jay_inst *I) I->op == JAY_OPCODE_EXTRACT_BYTE_PER_8LANES || I->op == JAY_OPCODE_OFFSET_PACKED_PIXEL_COORDS || I->op == JAY_OPCODE_DESWIZZLE_ODD || + I->op == JAY_OPCODE_INIT_HELPERS || I->op == JAY_OPCODE_MUL_32 || I->op == JAY_OPCODE_SHUFFLE || I->op == JAY_OPCODE_ZIP_UGPR16) { diff --git a/src/intel/compiler/jay/jay_to_binary.c b/src/intel/compiler/jay/jay_to_binary.c index 435c075527a..691bf52de1f 100644 --- a/src/intel/compiler/jay/jay_to_binary.c +++ b/src/intel/compiler/jay/jay_to_binary.c @@ -79,7 +79,7 @@ to_gen_operand( gen_operand R; unsigned reg = d.reg, count = jay_num_values(d); unsigned offset_B = 0, grf = 0; - assert(!hi || d.file == GPR); + assert(!hi || d.file == GPR || d.file == FLAG); if (count && (d.file == GPR || d.file == UGPR)) { struct jay_register_block block = @@ -189,7 +189,8 @@ to_gen_operand( * SIMD1 instructions and are never SIMD split. */ assert(simd_offs == 0 || idx >= 0); - unsigned offs_B = d.reg * (f->shader->dispatch_width / 8); + unsigned offs_B = + (d.reg * (f->shader->dispatch_width / 8)) + (hi ? 
2 : 0); R = gen_flag(offs_B / 2); } else if (d.file == J_ADDRESS) { R = gen_address(d.reg); @@ -580,6 +581,14 @@ emit(struct jay_codegen *jc, } break; + case JAY_OPCODE_HALT: + if (jay_halt_predicate_all(I)) { + assert(I->predication); + gen->pred_control = + jc->devinfo->ver >= 20 ? GEN_PREDICATE_XE2_ALL : GEN_PREDICATE_ALLV; + } + break; + case JAY_OPCODE_HALT_TARGET: /* HALT temporarily disables channels, and the same instruction is used * to re-enable them: once all channels are disabled, then they are diff --git a/src/intel/compiler/jay/jay_validate.c b/src/intel/compiler/jay/jay_validate.c index 948cbb3c564..e02620a1c8f 100644 --- a/src/intel/compiler/jay/jay_validate.c +++ b/src/intel/compiler/jay/jay_validate.c @@ -33,7 +33,9 @@ block_state_for_inst(jay_inst *I) if (I->op == JAY_OPCODE_PHI_DST || I->op == JAY_OPCODE_PRELOAD) { return STATE_PHI_DST; } else if (I->op == JAY_OPCODE_PHI_SRC || - (jay_op_is_control_flow(I->op) && I->op != JAY_OPCODE_ELSE)) { + (jay_op_is_control_flow(I->op) && + I->op != JAY_OPCODE_ELSE && + I->op != JAY_OPCODE_HALT_TARGET)) { return STATE_LATE; } else { return STATE_NORMAL; @@ -238,10 +240,6 @@ validate_inst(struct validate_state *validate, jay_inst *I) validate_flagness(validate, I->dst, I->type, "destination"); validate_flagness(validate, I->cond_flag, JAY_TYPE_U1, "cond_flag"); - CHECK(!I->conditional_mod || - !jay_is_null(I->cond_flag) || - I->op == JAY_OPCODE_CSEL); - /* These assumptions are baked into the definition of broadcast_flag and * required to ensure correctness with the lane masking. 
*/ @@ -256,8 +254,10 @@ validate_inst(struct validate_state *validate, jay_inst *I) CHECK(I->cond_flag.file != FLAG || I->dst.file != UGPR); /* Standard modifiers only allowed on some instructions */ - CHECK(!I->conditional_mod || opinfo->cmod || I->op == JAY_OPCODE_CSEL); CHECK(!I->saturate || opinfo->sat); + CHECK(!I->conditional_mod || + (I->op == JAY_OPCODE_CSEL || I->op == JAY_OPCODE_DEMOTE) || + (!jay_is_null(I->cond_flag) && opinfo->cmod)); unsigned num_srcs = I->num_srcs; diff --git a/src/intel/compiler/jay/meson.build b/src/intel/compiler/jay/meson.build index 949127141f6..6618569b7d1 100644 --- a/src/intel/compiler/jay/meson.build +++ b/src/intel/compiler/jay/meson.build @@ -54,6 +54,7 @@ libintel_compiler_jay_files = files( 'jay_ir.h', 'jay_insert_fp_mode.c', 'jay_liveness.c', + 'jay_lower_helpers.c', 'jay_lower_post_ra.c', 'jay_lower_pre_ra.c', 'jay_lower_scoreboard.c',